blob: b3c66200c6b2285de5fd5cbf171d977d2edb259f [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
Brian Silverman4c7235a2021-11-17 19:04:37 -08004#include <limits>
Alex Perrycb7da4b2019-08-28 19:35:56 -07005
Austin Schuhd7e252d2019-10-06 13:51:02 -07006namespace aos {
7
8void Tokenizer::ConsumeWhitespace() {
9 while (true) {
10 if (AtEnd()) {
11 return;
12 }
13 // Skip any whitespace.
14 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
15 ConsumeChar();
16 } else if (Char() == '\n') {
17 ConsumeChar();
18 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070019 } else if (Consume("/*")) {
20 while (!Consume("*/")) {
21 if (Char() == '\n') {
22 ++linenumber_;
23 }
24 ConsumeChar();
25 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070026 } else {
27 // There is no fail. Once we are out of whitespace (including 0 of it),
28 // declare success.
29 return;
30 }
31 }
32}
33
34bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080035 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070036 while (true) {
37 // Finishing the token is success.
38 if (*token == '\0') {
39 return true;
40 }
41
42 // But finishing the data first is failure.
43 if (AtEnd()) {
44 data_ = original;
45 return false;
46 }
47
48 // Missmatch is failure.
49 if (*token != Char()) {
50 data_ = original;
51 return false;
52 }
53
54 ConsumeChar();
55 ++token;
56 }
57}
58
59bool Tokenizer::ConsumeString(::std::string *s) {
60 // Under no conditions is it acceptible to run out of data while parsing a
61 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080062 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070063 if (AtEnd()) {
64 return false;
65 }
66
67 // Expect the leading "
68 if (Char() != '"') {
69 return false;
70 }
71
72 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080073 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070074 *s = ::std::string();
75
76 while (true) {
77 if (AtEnd()) {
78 data_ = original;
79 return false;
80 }
81
82 // If we get an end or an escape, do something special.
83 if (Char() == '"' || Char() == '\\') {
84 // Save what we found up until now, not including this character.
85 *s += ::std::string(
86 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
87
88 // Update the pointer.
89 last_parsed_data = data_;
90
91 // " is the end, declare victory.
92 if (Char() == '"') {
93 ConsumeChar();
Pallavi Madhukare2eb2812022-07-19 09:56:09 -070094 if (unicode_high_surrogate_ != -1) {
95 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
96 data_ = original;
97 return false;
98 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070099 return true;
100 } else {
101 ConsumeChar();
102 // Now consume valid escape characters and add their representation onto
103 // the output string.
104 if (AtEnd()) {
105 data_ = original;
106 return false;
107 } else if (Char() == '"') {
108 *s += "\"";
109 } else if (Char() == '\\') {
110 *s += "\\";
111 } else if (Char() == '/') {
112 *s += "/";
113 } else if (Char() == 'b') {
114 *s += "\b";
115 } else if (Char() == 'f') {
116 *s += "\f";
117 } else if (Char() == 'n') {
118 *s += "\n";
119 } else if (Char() == 'r') {
120 *s += "\r";
121 } else if (Char() == 't') {
122 *s += "\t";
123 } else if (Char() == 'u') {
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700124 if (!ConsumeUnicode(s)) {
125 fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
126 data_ = original;
127 return false;
128 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700129 }
130 }
131 // And skip the escaped character.
132 last_parsed_data = data_.substr(1);
133 }
134
135 ConsumeChar();
136 }
137}
138
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700139bool Tokenizer::ConsumeUnicode(::std::string *s) {
140 // Under no conditions is it acceptible to run out of data while parsing a
141 // unicode. Any AtEnd checks should confirm that.
142 uint32_t val;
143
144 // Consume unicode representation
145 ConsumeChar();
146
147 char target[5];
148
149 // Valid unicode is 4 hex digits so evaluate the next 4 characters
150 for (int count = 0; count < 4; count++) {
151 // If there is no data or data is an invalid char, return false
152 if (AtEnd()) {
153 return false;
154 }
155
156 if (!isxdigit(Char())) {
157 return false;
158 }
159
160 target[count] = Char();
161
162 // Do not consume the last character
163 if (count == 3) {
164 break;
165 }
166
167 ConsumeChar();
168 }
169 target[4] = '\0';
170
171 // References: flatbuffers/src/idl_parser.cpp
172 val = flatbuffers::StringToUInt(target, 16);
173
174 if (val >= 0xD800 && val <= 0xDBFF) {
175 if (unicode_high_surrogate_ != -1) {
176 fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
177 return false;
178 } else {
179 unicode_high_surrogate_ = static_cast<int>(val);
180 }
181 } else if (val >= 0xDC00 && val <= 0xDFFF) {
182 if (unicode_high_surrogate_ == -1) {
183 fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
184 return false;
185 } else {
186 int code_point =
187 0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
188 flatbuffers::ToUTF8(code_point, s);
189 unicode_high_surrogate_ = -1;
190 }
191 } else {
192 if (unicode_high_surrogate_ != -1) {
193 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
194 return false;
195 }
196 flatbuffers::ToUTF8(static_cast<int>(val), s);
197 }
198 return true;
199}
200
Austin Schuhd7e252d2019-10-06 13:51:02 -0700201bool Tokenizer::ConsumeNumber(::std::string *s) {
202 // Under no conditions is it acceptible to run out of data while parsing a
203 // number. Any AtEnd() checks should confirm that.
204 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800205 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700206
207 // Consume the leading - unconditionally.
208 Consume("-");
209
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800210 // See if we find nan. This isn't standards compliant, but is what
211 // flatbuffers prints out, so we need to parse it.
212 if (Consume("nan")) {
213 *s = ::std::string(original.substr(0, original.size() - data_.size()));
214 return true;
215 }
216
Brian Silverman714b1d62020-04-28 16:52:54 -0700217 // People tend to use null instead of nan. Accept that too.
218 if (Consume("null")) {
219 *s = ::std::string("nan");
220 return true;
221 }
222
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700223 // Inf is also acceptable.
224 if (Consume("inf")) {
225 *s = ::std::string(original.substr(0, original.size() - data_.size()));
226 return true;
227 }
228
Austin Schuhd7e252d2019-10-06 13:51:02 -0700229 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
230 // by a second number.
231 if (!Consume("0")) {
232 if (AtEnd()) {
233 return false;
234 } else if (Char() >= '1' && Char() <= '9') {
235 // This wasn't a zero, but was a valid digit. Consume it.
236 ConsumeChar();
237 } else {
238 return false;
239 }
240
241 // Now consume any number of any digits.
242 while (true) {
243 if (AtEnd()) {
244 data_ = original;
245 return false;
246 }
247 if (Char() < '0' || Char() > '9') {
248 break;
249 }
250 ConsumeChar();
251 }
252 }
253
254 // We could now have a decimal.
255 if (Char() == '.') {
256 ConsumeChar();
257 while (true) {
258 if (AtEnd()) {
259 data_ = original;
260 return false;
261 }
262 // And any number of digits.
263 if (Char() < '0' || Char() > '9') {
264 break;
265 }
266 ConsumeChar();
267 }
268 }
269
270 // And now an exponent.
271 if (Char() == 'e' || Char() == 'E') {
272 ConsumeChar();
273 if (AtEnd()) {
274 data_ = original;
275 return false;
276 }
277
278 // Which could have a +-
279 if (Char() == '+' || Char() == '-') {
280 ConsumeChar();
281 }
282 int count = 0;
283 while (true) {
284 if (AtEnd()) {
285 data_ = original;
286 return false;
287 }
288 // And digits.
289 if (Char() < '0' || Char() > '9') {
290 break;
291 }
292 ConsumeChar();
293 ++count;
294 }
295 // But, it is an error to have an exponent and nothing following it.
296 if (count == 0) {
297 data_ = original;
298 return false;
299 }
300 }
301
302 *s = ::std::string(original.substr(0, original.size() - data_.size()));
303 return true;
304}
305
306Tokenizer::TokenType Tokenizer::Next() {
307 switch (state_) {
308 case State::kExpectObjectStart:
309 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700310 if (!Consume("{")) {
311 fprintf(stderr, "Error on line %d, expected { for start.\n",
312 linenumber_);
313 return TokenType::kError;
314 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700315
316 // Document that we just started an object.
317 object_type_.push_back(ObjectType::kObject);
318
319 ConsumeWhitespace();
320
321 if (Consume("}")) {
322 ConsumeWhitespace();
323 state_ = State::kExpectObjectEnd;
324 } else {
325 state_ = State::kExpectField;
326 }
327 return TokenType::kStartObject;
328
329 case State::kExpectField: {
330 // Fields are built up of strings, whitespace, and then a : (followed by
331 // whitespace...)
332 ::std::string s;
333 if (!ConsumeString(&s)) {
334 fprintf(stderr, "Error on line %d, expected string for field name.\n",
335 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800336 if (Consume("}")) {
337 fprintf(stderr,
338 "Got '}' instead. Did you add an extra trailing ','?\n");
339 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700340 return TokenType::kError;
341 }
342 field_name_ = ::std::move(s);
343
344 ConsumeWhitespace();
345
346 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800347 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
348 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700349 return TokenType::kError;
350 }
351
352 ConsumeWhitespace();
353
354 state_ = State::kExpectValue;
355
356 return TokenType::kField;
357 } break;
358 case State::kExpectValue: {
359 TokenType result = TokenType::kError;
360
361 ::std::string s;
362 if (Consume("{")) {
363 // Fields are in objects. Record and recurse.
364 object_type_.push_back(ObjectType::kObject);
365
366 ConsumeWhitespace();
367
Alex Perrycb7da4b2019-08-28 19:35:56 -0700368 // And then if we encounter the end again, go to the end state.
369 if (Consume("}")) {
370 ConsumeWhitespace();
371 state_ = State::kExpectObjectEnd;
372 } else {
373 state_ = State::kExpectField;
374 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700375 return TokenType::kStartObject;
376 } else if (Consume("[")) {
377 // Values are in arrays. Record and recurse.
378 object_type_.push_back(ObjectType::kArray);
379
380 ConsumeWhitespace();
381 state_ = State::kExpectValue;
382 return TokenType::kStartArray;
383 } else if (ConsumeString(&s)) {
384 // Parsed as a string, grab it.
385 field_value_ = ::std::move(s);
386 result = TokenType::kStringValue;
387 } else if (ConsumeNumber(&s)) {
388 // Parsed as a number, grab it.
389 field_value_ = ::std::move(s);
390 result = TokenType::kNumberValue;
391 } else if (Consume("true")) {
392 // Parsed as a true, grab it.
393 field_value_ = "true";
394 result = TokenType::kTrueValue;
395 } else if (Consume("false")) {
396 // Parsed as a false, grab it.
397 field_value_ = "false";
398 result = TokenType::kFalseValue;
399 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700400 switch (object_type_.back()) {
401 case ObjectType::kObject:
402 if (Consume("}")) {
403 ConsumeWhitespace();
404 state_ = State::kExpectObjectEnd;
405 return Next();
406 }
407 break;
408 case ObjectType::kArray:
409 if (Consume("]")) {
410 ConsumeWhitespace();
411 state_ = State::kExpectArrayEnd;
412 return Next();
413 }
414 break;
415 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700416 // Couldn't parse, so we have a syntax error.
417 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
418 }
419
420 ConsumeWhitespace();
421
422 // After a field, we either have a , and another field (or value if we are
423 // in an array), or we should be closing out the object (or array).
424 if (Consume(",")) {
425 ConsumeWhitespace();
426 switch (object_type_.back()) {
427 case ObjectType::kObject:
428 state_ = State::kExpectField;
429 break;
430 case ObjectType::kArray:
431 state_ = State::kExpectValue;
432 break;
433 }
434 } else {
435 // Sanity check that the stack is deep enough.
436 if (object_type_.size() == 0) {
437 fprintf(stderr, "Error on line %d\n", linenumber_);
438 return TokenType::kError;
439 }
440
441 // And then require closing out the object.
442 switch (object_type_.back()) {
443 case ObjectType::kObject:
444 if (Consume("}")) {
445 ConsumeWhitespace();
446 state_ = State::kExpectObjectEnd;
447 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700448 fprintf(stderr, "Error on line %d, expected } or ,\n",
449 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700450 return TokenType::kError;
451 }
452 break;
453 case ObjectType::kArray:
454 if (Consume("]")) {
455 ConsumeWhitespace();
456 state_ = State::kExpectArrayEnd;
457 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700458 fprintf(stderr, "Error on line %d, expected ] or ,\n",
459 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700460 return TokenType::kError;
461 }
462 break;
463 }
464 }
465 return result;
466 } break;
467
468 case State::kExpectArrayEnd:
469 case State::kExpectObjectEnd: {
470 const TokenType result = state_ == State::kExpectArrayEnd
471 ? TokenType::kEndArray
472 : TokenType::kEndObject;
473 // This is a transient state so we can send 2 tokens out in a row. We
474 // discover the object or array end at the end of reading the value.
475 object_type_.pop_back();
476 if (object_type_.size() == 0) {
477 // We unwound the outer object. We should send kEnd next.
478 state_ = State::kExpectEnd;
479 } else if (object_type_.back() == ObjectType::kObject) {
480 // If we are going into an object, it should either have another field
481 // or end.
482 if (Consume(",")) {
483 ConsumeWhitespace();
484 state_ = State::kExpectField;
485 } else if (Consume("}")) {
486 ConsumeWhitespace();
487 state_ = State::kExpectObjectEnd;
488 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800489 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700490 return TokenType::kError;
491 }
492 } else if (object_type_.back() == ObjectType::kArray) {
493 // If we are going into an array, it should either have another value
494 // or end.
495 if (Consume(",")) {
496 ConsumeWhitespace();
497 state_ = State::kExpectValue;
498 } else if (Consume("]")) {
499 ConsumeWhitespace();
500 state_ = State::kExpectArrayEnd;
501 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800502 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700503 return TokenType::kError;
504 }
505 }
506 // And then send out the correct token.
507 return result;
508 }
509 case State::kExpectEnd:
510 // If we are supposed to be done, confirm nothing is after the end.
511 if (AtEnd()) {
512 return TokenType::kEnd;
513 } else {
514 fprintf(stderr, "Data past end at line %d\n", linenumber_);
515 return TokenType::kError;
516 }
517 }
518 return TokenType::kError;
519}
520
521bool Tokenizer::FieldAsInt(long long *value) {
522 const char *pos = field_value().c_str();
523 errno = 0;
524 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
525 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
526 return false;
527 }
528 return true;
529}
530
531bool Tokenizer::FieldAsDouble(double *value) {
532 const char *pos = field_value().c_str();
533 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800534 if (field_value() == "nan") {
535 *value = std::numeric_limits<double>::quiet_NaN();
536 return true;
537 } else if (field_value() == "-nan") {
538 *value = -std::numeric_limits<double>::quiet_NaN();
539 return true;
540 }
541
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700542 if (field_value() == "inf") {
543 *value = std::numeric_limits<double>::infinity();
544 return true;
545 } else if (field_value() == "-inf") {
546 *value = -std::numeric_limits<double>::infinity();
547 return true;
548 }
549
Austin Schuhd7e252d2019-10-06 13:51:02 -0700550 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
551
552 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
553 return false;
554 }
555 return true;
556}
557
558} // namespace aos