blob: 32c92475278196e9f1419ac3f88bf0796ee40693 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
Brian Silverman4c7235a2021-11-17 19:04:37 -08004#include <limits>
Alex Perrycb7da4b2019-08-28 19:35:56 -07005
Austin Schuhd7e252d2019-10-06 13:51:02 -07006namespace aos {
7
8void Tokenizer::ConsumeWhitespace() {
9 while (true) {
10 if (AtEnd()) {
11 return;
12 }
13 // Skip any whitespace.
14 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
15 ConsumeChar();
16 } else if (Char() == '\n') {
17 ConsumeChar();
18 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070019 } else if (Consume("/*")) {
20 while (!Consume("*/")) {
21 if (Char() == '\n') {
22 ++linenumber_;
23 }
24 ConsumeChar();
25 }
Stephan Pleines89836852023-09-15 20:11:57 -070026 } else if (Consume("//")) {
27 // C++ style comment. Keep consuming chars until newline, or until the
28 // end of the file if this is the last line (no newline at end of file).
29 while (true) {
Brian J Griglak2e16e7b2024-03-01 12:10:46 -070030 // First check if we are at the end of the file.
Stephan Pleines89836852023-09-15 20:11:57 -070031 if (AtEnd()) {
32 return;
33 }
Brian J Griglak2e16e7b2024-03-01 12:10:46 -070034 // Then check if we are at the end of the line.
Stephan Pleines89836852023-09-15 20:11:57 -070035 if (Char() == '\n') {
36 ++linenumber_;
37 break;
38 }
Brian J Griglak2e16e7b2024-03-01 12:10:46 -070039 // Advance to next character and repeat.
40 ConsumeChar();
Stephan Pleines89836852023-09-15 20:11:57 -070041 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070042 } else {
43 // There is no fail. Once we are out of whitespace (including 0 of it),
44 // declare success.
45 return;
46 }
47 }
48}
49
50bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080051 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070052 while (true) {
53 // Finishing the token is success.
54 if (*token == '\0') {
55 return true;
56 }
57
58 // But finishing the data first is failure.
59 if (AtEnd()) {
60 data_ = original;
61 return false;
62 }
63
64 // Missmatch is failure.
65 if (*token != Char()) {
66 data_ = original;
67 return false;
68 }
69
70 ConsumeChar();
71 ++token;
72 }
73}
74
75bool Tokenizer::ConsumeString(::std::string *s) {
76 // Under no conditions is it acceptible to run out of data while parsing a
77 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080078 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070079 if (AtEnd()) {
80 return false;
81 }
82
83 // Expect the leading "
84 if (Char() != '"') {
85 return false;
86 }
87
88 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080089 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070090 *s = ::std::string();
91
92 while (true) {
93 if (AtEnd()) {
94 data_ = original;
95 return false;
96 }
97
98 // If we get an end or an escape, do something special.
99 if (Char() == '"' || Char() == '\\') {
100 // Save what we found up until now, not including this character.
101 *s += ::std::string(
102 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
103
104 // Update the pointer.
105 last_parsed_data = data_;
106
107 // " is the end, declare victory.
108 if (Char() == '"') {
109 ConsumeChar();
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700110 if (unicode_high_surrogate_ != -1) {
111 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
112 data_ = original;
113 return false;
114 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700115 return true;
116 } else {
117 ConsumeChar();
118 // Now consume valid escape characters and add their representation onto
119 // the output string.
120 if (AtEnd()) {
121 data_ = original;
122 return false;
123 } else if (Char() == '"') {
124 *s += "\"";
125 } else if (Char() == '\\') {
126 *s += "\\";
127 } else if (Char() == '/') {
128 *s += "/";
129 } else if (Char() == 'b') {
130 *s += "\b";
131 } else if (Char() == 'f') {
132 *s += "\f";
133 } else if (Char() == 'n') {
134 *s += "\n";
135 } else if (Char() == 'r') {
136 *s += "\r";
137 } else if (Char() == 't') {
138 *s += "\t";
139 } else if (Char() == 'u') {
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700140 if (!ConsumeUnicode(s)) {
141 fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
142 data_ = original;
143 return false;
144 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700145 }
146 }
147 // And skip the escaped character.
148 last_parsed_data = data_.substr(1);
149 }
150
151 ConsumeChar();
152 }
153}
154
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700155bool Tokenizer::ConsumeUnicode(::std::string *s) {
156 // Under no conditions is it acceptible to run out of data while parsing a
157 // unicode. Any AtEnd checks should confirm that.
158 uint32_t val;
159
160 // Consume unicode representation
161 ConsumeChar();
162
163 char target[5];
164
165 // Valid unicode is 4 hex digits so evaluate the next 4 characters
166 for (int count = 0; count < 4; count++) {
167 // If there is no data or data is an invalid char, return false
168 if (AtEnd()) {
169 return false;
170 }
171
172 if (!isxdigit(Char())) {
173 return false;
174 }
175
176 target[count] = Char();
177
178 // Do not consume the last character
179 if (count == 3) {
180 break;
181 }
182
183 ConsumeChar();
184 }
185 target[4] = '\0';
186
187 // References: flatbuffers/src/idl_parser.cpp
188 val = flatbuffers::StringToUInt(target, 16);
189
190 if (val >= 0xD800 && val <= 0xDBFF) {
191 if (unicode_high_surrogate_ != -1) {
192 fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
193 return false;
194 } else {
195 unicode_high_surrogate_ = static_cast<int>(val);
196 }
197 } else if (val >= 0xDC00 && val <= 0xDFFF) {
198 if (unicode_high_surrogate_ == -1) {
199 fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
200 return false;
201 } else {
202 int code_point =
203 0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
204 flatbuffers::ToUTF8(code_point, s);
205 unicode_high_surrogate_ = -1;
206 }
207 } else {
208 if (unicode_high_surrogate_ != -1) {
209 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
210 return false;
211 }
212 flatbuffers::ToUTF8(static_cast<int>(val), s);
213 }
214 return true;
215}
216
Austin Schuhd7e252d2019-10-06 13:51:02 -0700217bool Tokenizer::ConsumeNumber(::std::string *s) {
218 // Under no conditions is it acceptible to run out of data while parsing a
219 // number. Any AtEnd() checks should confirm that.
220 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800221 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700222
223 // Consume the leading - unconditionally.
224 Consume("-");
225
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800226 // See if we find nan. This isn't standards compliant, but is what
227 // flatbuffers prints out, so we need to parse it.
228 if (Consume("nan")) {
229 *s = ::std::string(original.substr(0, original.size() - data_.size()));
230 return true;
231 }
232
Brian Silverman714b1d62020-04-28 16:52:54 -0700233 // People tend to use null instead of nan. Accept that too.
234 if (Consume("null")) {
235 *s = ::std::string("nan");
236 return true;
237 }
238
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700239 // Inf is also acceptable.
240 if (Consume("inf")) {
241 *s = ::std::string(original.substr(0, original.size() - data_.size()));
242 return true;
243 }
244
Austin Schuhd7e252d2019-10-06 13:51:02 -0700245 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
246 // by a second number.
247 if (!Consume("0")) {
248 if (AtEnd()) {
249 return false;
250 } else if (Char() >= '1' && Char() <= '9') {
251 // This wasn't a zero, but was a valid digit. Consume it.
252 ConsumeChar();
253 } else {
254 return false;
255 }
256
257 // Now consume any number of any digits.
258 while (true) {
259 if (AtEnd()) {
260 data_ = original;
261 return false;
262 }
263 if (Char() < '0' || Char() > '9') {
264 break;
265 }
266 ConsumeChar();
267 }
268 }
269
270 // We could now have a decimal.
271 if (Char() == '.') {
272 ConsumeChar();
273 while (true) {
274 if (AtEnd()) {
275 data_ = original;
276 return false;
277 }
278 // And any number of digits.
279 if (Char() < '0' || Char() > '9') {
280 break;
281 }
282 ConsumeChar();
283 }
284 }
285
286 // And now an exponent.
287 if (Char() == 'e' || Char() == 'E') {
288 ConsumeChar();
289 if (AtEnd()) {
290 data_ = original;
291 return false;
292 }
293
294 // Which could have a +-
295 if (Char() == '+' || Char() == '-') {
296 ConsumeChar();
297 }
298 int count = 0;
299 while (true) {
300 if (AtEnd()) {
301 data_ = original;
302 return false;
303 }
304 // And digits.
305 if (Char() < '0' || Char() > '9') {
306 break;
307 }
308 ConsumeChar();
309 ++count;
310 }
311 // But, it is an error to have an exponent and nothing following it.
312 if (count == 0) {
313 data_ = original;
314 return false;
315 }
316 }
317
318 *s = ::std::string(original.substr(0, original.size() - data_.size()));
319 return true;
320}
321
322Tokenizer::TokenType Tokenizer::Next() {
323 switch (state_) {
324 case State::kExpectObjectStart:
325 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700326 if (!Consume("{")) {
327 fprintf(stderr, "Error on line %d, expected { for start.\n",
328 linenumber_);
329 return TokenType::kError;
330 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700331
332 // Document that we just started an object.
333 object_type_.push_back(ObjectType::kObject);
334
335 ConsumeWhitespace();
336
337 if (Consume("}")) {
338 ConsumeWhitespace();
339 state_ = State::kExpectObjectEnd;
340 } else {
341 state_ = State::kExpectField;
342 }
343 return TokenType::kStartObject;
344
345 case State::kExpectField: {
346 // Fields are built up of strings, whitespace, and then a : (followed by
347 // whitespace...)
348 ::std::string s;
349 if (!ConsumeString(&s)) {
350 fprintf(stderr, "Error on line %d, expected string for field name.\n",
351 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800352 if (Consume("}")) {
353 fprintf(stderr,
354 "Got '}' instead. Did you add an extra trailing ','?\n");
355 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700356 return TokenType::kError;
357 }
358 field_name_ = ::std::move(s);
359
360 ConsumeWhitespace();
361
362 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800363 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
364 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700365 return TokenType::kError;
366 }
367
368 ConsumeWhitespace();
369
370 state_ = State::kExpectValue;
371
372 return TokenType::kField;
373 } break;
374 case State::kExpectValue: {
375 TokenType result = TokenType::kError;
376
377 ::std::string s;
378 if (Consume("{")) {
379 // Fields are in objects. Record and recurse.
380 object_type_.push_back(ObjectType::kObject);
381
382 ConsumeWhitespace();
383
Alex Perrycb7da4b2019-08-28 19:35:56 -0700384 // And then if we encounter the end again, go to the end state.
385 if (Consume("}")) {
386 ConsumeWhitespace();
387 state_ = State::kExpectObjectEnd;
388 } else {
389 state_ = State::kExpectField;
390 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700391 return TokenType::kStartObject;
392 } else if (Consume("[")) {
393 // Values are in arrays. Record and recurse.
394 object_type_.push_back(ObjectType::kArray);
395
396 ConsumeWhitespace();
397 state_ = State::kExpectValue;
398 return TokenType::kStartArray;
399 } else if (ConsumeString(&s)) {
400 // Parsed as a string, grab it.
401 field_value_ = ::std::move(s);
402 result = TokenType::kStringValue;
403 } else if (ConsumeNumber(&s)) {
404 // Parsed as a number, grab it.
405 field_value_ = ::std::move(s);
406 result = TokenType::kNumberValue;
407 } else if (Consume("true")) {
408 // Parsed as a true, grab it.
409 field_value_ = "true";
410 result = TokenType::kTrueValue;
411 } else if (Consume("false")) {
412 // Parsed as a false, grab it.
413 field_value_ = "false";
414 result = TokenType::kFalseValue;
415 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700416 switch (object_type_.back()) {
417 case ObjectType::kObject:
418 if (Consume("}")) {
419 ConsumeWhitespace();
420 state_ = State::kExpectObjectEnd;
421 return Next();
422 }
423 break;
424 case ObjectType::kArray:
425 if (Consume("]")) {
426 ConsumeWhitespace();
427 state_ = State::kExpectArrayEnd;
428 return Next();
429 }
430 break;
431 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700432 // Couldn't parse, so we have a syntax error.
433 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
434 }
435
436 ConsumeWhitespace();
437
438 // After a field, we either have a , and another field (or value if we are
439 // in an array), or we should be closing out the object (or array).
440 if (Consume(",")) {
441 ConsumeWhitespace();
442 switch (object_type_.back()) {
443 case ObjectType::kObject:
444 state_ = State::kExpectField;
445 break;
446 case ObjectType::kArray:
447 state_ = State::kExpectValue;
448 break;
449 }
450 } else {
451 // Sanity check that the stack is deep enough.
452 if (object_type_.size() == 0) {
453 fprintf(stderr, "Error on line %d\n", linenumber_);
454 return TokenType::kError;
455 }
456
457 // And then require closing out the object.
458 switch (object_type_.back()) {
459 case ObjectType::kObject:
460 if (Consume("}")) {
461 ConsumeWhitespace();
462 state_ = State::kExpectObjectEnd;
463 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700464 fprintf(stderr, "Error on line %d, expected } or ,\n",
465 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700466 return TokenType::kError;
467 }
468 break;
469 case ObjectType::kArray:
470 if (Consume("]")) {
471 ConsumeWhitespace();
472 state_ = State::kExpectArrayEnd;
473 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700474 fprintf(stderr, "Error on line %d, expected ] or ,\n",
475 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700476 return TokenType::kError;
477 }
478 break;
479 }
480 }
481 return result;
482 } break;
483
484 case State::kExpectArrayEnd:
485 case State::kExpectObjectEnd: {
486 const TokenType result = state_ == State::kExpectArrayEnd
487 ? TokenType::kEndArray
488 : TokenType::kEndObject;
489 // This is a transient state so we can send 2 tokens out in a row. We
490 // discover the object or array end at the end of reading the value.
491 object_type_.pop_back();
492 if (object_type_.size() == 0) {
493 // We unwound the outer object. We should send kEnd next.
494 state_ = State::kExpectEnd;
495 } else if (object_type_.back() == ObjectType::kObject) {
496 // If we are going into an object, it should either have another field
497 // or end.
498 if (Consume(",")) {
499 ConsumeWhitespace();
500 state_ = State::kExpectField;
501 } else if (Consume("}")) {
502 ConsumeWhitespace();
503 state_ = State::kExpectObjectEnd;
504 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800505 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700506 return TokenType::kError;
507 }
508 } else if (object_type_.back() == ObjectType::kArray) {
509 // If we are going into an array, it should either have another value
510 // or end.
511 if (Consume(",")) {
512 ConsumeWhitespace();
513 state_ = State::kExpectValue;
514 } else if (Consume("]")) {
515 ConsumeWhitespace();
516 state_ = State::kExpectArrayEnd;
517 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800518 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700519 return TokenType::kError;
520 }
521 }
522 // And then send out the correct token.
523 return result;
524 }
525 case State::kExpectEnd:
526 // If we are supposed to be done, confirm nothing is after the end.
527 if (AtEnd()) {
528 return TokenType::kEnd;
529 } else {
530 fprintf(stderr, "Data past end at line %d\n", linenumber_);
531 return TokenType::kError;
532 }
533 }
534 return TokenType::kError;
535}
536
James Kuszmaul768c4682023-10-12 21:07:16 -0700537bool Tokenizer::FieldAsInt(absl::int128 *value) {
Austin Schuhd7e252d2019-10-06 13:51:02 -0700538 const char *pos = field_value().c_str();
James Kuszmaul768c4682023-10-12 21:07:16 -0700539 return absl::SimpleAtoi(pos, value);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700540}
541
542bool Tokenizer::FieldAsDouble(double *value) {
543 const char *pos = field_value().c_str();
544 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800545 if (field_value() == "nan") {
546 *value = std::numeric_limits<double>::quiet_NaN();
547 return true;
548 } else if (field_value() == "-nan") {
549 *value = -std::numeric_limits<double>::quiet_NaN();
550 return true;
551 }
552
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700553 if (field_value() == "inf") {
554 *value = std::numeric_limits<double>::infinity();
555 return true;
556 } else if (field_value() == "-inf") {
557 *value = -std::numeric_limits<double>::infinity();
558 return true;
559 }
560
Austin Schuhd7e252d2019-10-06 13:51:02 -0700561 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
562
563 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
564 return false;
565 }
566 return true;
567}
568
569} // namespace aos