blob: eab7fccededf9b9082fd53073102d720e89ffcd3 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
Brian Silverman4c7235a2021-11-17 19:04:37 -08004#include <limits>
Alex Perrycb7da4b2019-08-28 19:35:56 -07005
Austin Schuhd7e252d2019-10-06 13:51:02 -07006namespace aos {
7
8void Tokenizer::ConsumeWhitespace() {
9 while (true) {
10 if (AtEnd()) {
11 return;
12 }
13 // Skip any whitespace.
14 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
15 ConsumeChar();
16 } else if (Char() == '\n') {
17 ConsumeChar();
18 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070019 } else if (Consume("/*")) {
20 while (!Consume("*/")) {
21 if (Char() == '\n') {
22 ++linenumber_;
23 }
24 ConsumeChar();
25 }
Stephan Pleines89836852023-09-15 20:11:57 -070026 } else if (Consume("//")) {
27 // C++ style comment. Keep consuming chars until newline, or until the
28 // end of the file if this is the last line (no newline at end of file).
29 while (true) {
30 ConsumeChar();
31 if (AtEnd()) {
32 return;
33 }
34 if (Char() == '\n') {
35 ++linenumber_;
36 break;
37 }
38 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070039 } else {
40 // There is no fail. Once we are out of whitespace (including 0 of it),
41 // declare success.
42 return;
43 }
44 }
45}
46
47bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080048 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070049 while (true) {
50 // Finishing the token is success.
51 if (*token == '\0') {
52 return true;
53 }
54
55 // But finishing the data first is failure.
56 if (AtEnd()) {
57 data_ = original;
58 return false;
59 }
60
61 // Missmatch is failure.
62 if (*token != Char()) {
63 data_ = original;
64 return false;
65 }
66
67 ConsumeChar();
68 ++token;
69 }
70}
71
72bool Tokenizer::ConsumeString(::std::string *s) {
73 // Under no conditions is it acceptible to run out of data while parsing a
74 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080075 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070076 if (AtEnd()) {
77 return false;
78 }
79
80 // Expect the leading "
81 if (Char() != '"') {
82 return false;
83 }
84
85 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080086 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070087 *s = ::std::string();
88
89 while (true) {
90 if (AtEnd()) {
91 data_ = original;
92 return false;
93 }
94
95 // If we get an end or an escape, do something special.
96 if (Char() == '"' || Char() == '\\') {
97 // Save what we found up until now, not including this character.
98 *s += ::std::string(
99 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
100
101 // Update the pointer.
102 last_parsed_data = data_;
103
104 // " is the end, declare victory.
105 if (Char() == '"') {
106 ConsumeChar();
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700107 if (unicode_high_surrogate_ != -1) {
108 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
109 data_ = original;
110 return false;
111 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700112 return true;
113 } else {
114 ConsumeChar();
115 // Now consume valid escape characters and add their representation onto
116 // the output string.
117 if (AtEnd()) {
118 data_ = original;
119 return false;
120 } else if (Char() == '"') {
121 *s += "\"";
122 } else if (Char() == '\\') {
123 *s += "\\";
124 } else if (Char() == '/') {
125 *s += "/";
126 } else if (Char() == 'b') {
127 *s += "\b";
128 } else if (Char() == 'f') {
129 *s += "\f";
130 } else if (Char() == 'n') {
131 *s += "\n";
132 } else if (Char() == 'r') {
133 *s += "\r";
134 } else if (Char() == 't') {
135 *s += "\t";
136 } else if (Char() == 'u') {
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700137 if (!ConsumeUnicode(s)) {
138 fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
139 data_ = original;
140 return false;
141 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700142 }
143 }
144 // And skip the escaped character.
145 last_parsed_data = data_.substr(1);
146 }
147
148 ConsumeChar();
149 }
150}
151
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700152bool Tokenizer::ConsumeUnicode(::std::string *s) {
153 // Under no conditions is it acceptible to run out of data while parsing a
154 // unicode. Any AtEnd checks should confirm that.
155 uint32_t val;
156
157 // Consume unicode representation
158 ConsumeChar();
159
160 char target[5];
161
162 // Valid unicode is 4 hex digits so evaluate the next 4 characters
163 for (int count = 0; count < 4; count++) {
164 // If there is no data or data is an invalid char, return false
165 if (AtEnd()) {
166 return false;
167 }
168
169 if (!isxdigit(Char())) {
170 return false;
171 }
172
173 target[count] = Char();
174
175 // Do not consume the last character
176 if (count == 3) {
177 break;
178 }
179
180 ConsumeChar();
181 }
182 target[4] = '\0';
183
184 // References: flatbuffers/src/idl_parser.cpp
185 val = flatbuffers::StringToUInt(target, 16);
186
187 if (val >= 0xD800 && val <= 0xDBFF) {
188 if (unicode_high_surrogate_ != -1) {
189 fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
190 return false;
191 } else {
192 unicode_high_surrogate_ = static_cast<int>(val);
193 }
194 } else if (val >= 0xDC00 && val <= 0xDFFF) {
195 if (unicode_high_surrogate_ == -1) {
196 fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
197 return false;
198 } else {
199 int code_point =
200 0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
201 flatbuffers::ToUTF8(code_point, s);
202 unicode_high_surrogate_ = -1;
203 }
204 } else {
205 if (unicode_high_surrogate_ != -1) {
206 fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
207 return false;
208 }
209 flatbuffers::ToUTF8(static_cast<int>(val), s);
210 }
211 return true;
212}
213
Austin Schuhd7e252d2019-10-06 13:51:02 -0700214bool Tokenizer::ConsumeNumber(::std::string *s) {
215 // Under no conditions is it acceptible to run out of data while parsing a
216 // number. Any AtEnd() checks should confirm that.
217 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800218 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700219
220 // Consume the leading - unconditionally.
221 Consume("-");
222
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800223 // See if we find nan. This isn't standards compliant, but is what
224 // flatbuffers prints out, so we need to parse it.
225 if (Consume("nan")) {
226 *s = ::std::string(original.substr(0, original.size() - data_.size()));
227 return true;
228 }
229
Brian Silverman714b1d62020-04-28 16:52:54 -0700230 // People tend to use null instead of nan. Accept that too.
231 if (Consume("null")) {
232 *s = ::std::string("nan");
233 return true;
234 }
235
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700236 // Inf is also acceptable.
237 if (Consume("inf")) {
238 *s = ::std::string(original.substr(0, original.size() - data_.size()));
239 return true;
240 }
241
Austin Schuhd7e252d2019-10-06 13:51:02 -0700242 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
243 // by a second number.
244 if (!Consume("0")) {
245 if (AtEnd()) {
246 return false;
247 } else if (Char() >= '1' && Char() <= '9') {
248 // This wasn't a zero, but was a valid digit. Consume it.
249 ConsumeChar();
250 } else {
251 return false;
252 }
253
254 // Now consume any number of any digits.
255 while (true) {
256 if (AtEnd()) {
257 data_ = original;
258 return false;
259 }
260 if (Char() < '0' || Char() > '9') {
261 break;
262 }
263 ConsumeChar();
264 }
265 }
266
267 // We could now have a decimal.
268 if (Char() == '.') {
269 ConsumeChar();
270 while (true) {
271 if (AtEnd()) {
272 data_ = original;
273 return false;
274 }
275 // And any number of digits.
276 if (Char() < '0' || Char() > '9') {
277 break;
278 }
279 ConsumeChar();
280 }
281 }
282
283 // And now an exponent.
284 if (Char() == 'e' || Char() == 'E') {
285 ConsumeChar();
286 if (AtEnd()) {
287 data_ = original;
288 return false;
289 }
290
291 // Which could have a +-
292 if (Char() == '+' || Char() == '-') {
293 ConsumeChar();
294 }
295 int count = 0;
296 while (true) {
297 if (AtEnd()) {
298 data_ = original;
299 return false;
300 }
301 // And digits.
302 if (Char() < '0' || Char() > '9') {
303 break;
304 }
305 ConsumeChar();
306 ++count;
307 }
308 // But, it is an error to have an exponent and nothing following it.
309 if (count == 0) {
310 data_ = original;
311 return false;
312 }
313 }
314
315 *s = ::std::string(original.substr(0, original.size() - data_.size()));
316 return true;
317}
318
319Tokenizer::TokenType Tokenizer::Next() {
320 switch (state_) {
321 case State::kExpectObjectStart:
322 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700323 if (!Consume("{")) {
324 fprintf(stderr, "Error on line %d, expected { for start.\n",
325 linenumber_);
326 return TokenType::kError;
327 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700328
329 // Document that we just started an object.
330 object_type_.push_back(ObjectType::kObject);
331
332 ConsumeWhitespace();
333
334 if (Consume("}")) {
335 ConsumeWhitespace();
336 state_ = State::kExpectObjectEnd;
337 } else {
338 state_ = State::kExpectField;
339 }
340 return TokenType::kStartObject;
341
342 case State::kExpectField: {
343 // Fields are built up of strings, whitespace, and then a : (followed by
344 // whitespace...)
345 ::std::string s;
346 if (!ConsumeString(&s)) {
347 fprintf(stderr, "Error on line %d, expected string for field name.\n",
348 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800349 if (Consume("}")) {
350 fprintf(stderr,
351 "Got '}' instead. Did you add an extra trailing ','?\n");
352 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700353 return TokenType::kError;
354 }
355 field_name_ = ::std::move(s);
356
357 ConsumeWhitespace();
358
359 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800360 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
361 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700362 return TokenType::kError;
363 }
364
365 ConsumeWhitespace();
366
367 state_ = State::kExpectValue;
368
369 return TokenType::kField;
370 } break;
371 case State::kExpectValue: {
372 TokenType result = TokenType::kError;
373
374 ::std::string s;
375 if (Consume("{")) {
376 // Fields are in objects. Record and recurse.
377 object_type_.push_back(ObjectType::kObject);
378
379 ConsumeWhitespace();
380
Alex Perrycb7da4b2019-08-28 19:35:56 -0700381 // And then if we encounter the end again, go to the end state.
382 if (Consume("}")) {
383 ConsumeWhitespace();
384 state_ = State::kExpectObjectEnd;
385 } else {
386 state_ = State::kExpectField;
387 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700388 return TokenType::kStartObject;
389 } else if (Consume("[")) {
390 // Values are in arrays. Record and recurse.
391 object_type_.push_back(ObjectType::kArray);
392
393 ConsumeWhitespace();
394 state_ = State::kExpectValue;
395 return TokenType::kStartArray;
396 } else if (ConsumeString(&s)) {
397 // Parsed as a string, grab it.
398 field_value_ = ::std::move(s);
399 result = TokenType::kStringValue;
400 } else if (ConsumeNumber(&s)) {
401 // Parsed as a number, grab it.
402 field_value_ = ::std::move(s);
403 result = TokenType::kNumberValue;
404 } else if (Consume("true")) {
405 // Parsed as a true, grab it.
406 field_value_ = "true";
407 result = TokenType::kTrueValue;
408 } else if (Consume("false")) {
409 // Parsed as a false, grab it.
410 field_value_ = "false";
411 result = TokenType::kFalseValue;
412 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700413 switch (object_type_.back()) {
414 case ObjectType::kObject:
415 if (Consume("}")) {
416 ConsumeWhitespace();
417 state_ = State::kExpectObjectEnd;
418 return Next();
419 }
420 break;
421 case ObjectType::kArray:
422 if (Consume("]")) {
423 ConsumeWhitespace();
424 state_ = State::kExpectArrayEnd;
425 return Next();
426 }
427 break;
428 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700429 // Couldn't parse, so we have a syntax error.
430 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
431 }
432
433 ConsumeWhitespace();
434
435 // After a field, we either have a , and another field (or value if we are
436 // in an array), or we should be closing out the object (or array).
437 if (Consume(",")) {
438 ConsumeWhitespace();
439 switch (object_type_.back()) {
440 case ObjectType::kObject:
441 state_ = State::kExpectField;
442 break;
443 case ObjectType::kArray:
444 state_ = State::kExpectValue;
445 break;
446 }
447 } else {
448 // Sanity check that the stack is deep enough.
449 if (object_type_.size() == 0) {
450 fprintf(stderr, "Error on line %d\n", linenumber_);
451 return TokenType::kError;
452 }
453
454 // And then require closing out the object.
455 switch (object_type_.back()) {
456 case ObjectType::kObject:
457 if (Consume("}")) {
458 ConsumeWhitespace();
459 state_ = State::kExpectObjectEnd;
460 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700461 fprintf(stderr, "Error on line %d, expected } or ,\n",
462 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700463 return TokenType::kError;
464 }
465 break;
466 case ObjectType::kArray:
467 if (Consume("]")) {
468 ConsumeWhitespace();
469 state_ = State::kExpectArrayEnd;
470 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700471 fprintf(stderr, "Error on line %d, expected ] or ,\n",
472 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700473 return TokenType::kError;
474 }
475 break;
476 }
477 }
478 return result;
479 } break;
480
481 case State::kExpectArrayEnd:
482 case State::kExpectObjectEnd: {
483 const TokenType result = state_ == State::kExpectArrayEnd
484 ? TokenType::kEndArray
485 : TokenType::kEndObject;
486 // This is a transient state so we can send 2 tokens out in a row. We
487 // discover the object or array end at the end of reading the value.
488 object_type_.pop_back();
489 if (object_type_.size() == 0) {
490 // We unwound the outer object. We should send kEnd next.
491 state_ = State::kExpectEnd;
492 } else if (object_type_.back() == ObjectType::kObject) {
493 // If we are going into an object, it should either have another field
494 // or end.
495 if (Consume(",")) {
496 ConsumeWhitespace();
497 state_ = State::kExpectField;
498 } else if (Consume("}")) {
499 ConsumeWhitespace();
500 state_ = State::kExpectObjectEnd;
501 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800502 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700503 return TokenType::kError;
504 }
505 } else if (object_type_.back() == ObjectType::kArray) {
506 // If we are going into an array, it should either have another value
507 // or end.
508 if (Consume(",")) {
509 ConsumeWhitespace();
510 state_ = State::kExpectValue;
511 } else if (Consume("]")) {
512 ConsumeWhitespace();
513 state_ = State::kExpectArrayEnd;
514 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800515 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700516 return TokenType::kError;
517 }
518 }
519 // And then send out the correct token.
520 return result;
521 }
522 case State::kExpectEnd:
523 // If we are supposed to be done, confirm nothing is after the end.
524 if (AtEnd()) {
525 return TokenType::kEnd;
526 } else {
527 fprintf(stderr, "Data past end at line %d\n", linenumber_);
528 return TokenType::kError;
529 }
530 }
531 return TokenType::kError;
532}
533
534bool Tokenizer::FieldAsInt(long long *value) {
535 const char *pos = field_value().c_str();
536 errno = 0;
537 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
538 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
539 return false;
540 }
541 return true;
542}
543
544bool Tokenizer::FieldAsDouble(double *value) {
545 const char *pos = field_value().c_str();
546 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800547 if (field_value() == "nan") {
548 *value = std::numeric_limits<double>::quiet_NaN();
549 return true;
550 } else if (field_value() == "-nan") {
551 *value = -std::numeric_limits<double>::quiet_NaN();
552 return true;
553 }
554
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700555 if (field_value() == "inf") {
556 *value = std::numeric_limits<double>::infinity();
557 return true;
558 } else if (field_value() == "-inf") {
559 *value = -std::numeric_limits<double>::infinity();
560 return true;
561 }
562
Austin Schuhd7e252d2019-10-06 13:51:02 -0700563 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
564
565 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
566 return false;
567 }
568 return true;
569}
570
571} // namespace aos