blob: 0e235e6b5f7e0cfcda74d67a86545d0bf5c24a51 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
Brian Silverman4c7235a2021-11-17 19:04:37 -08004#include <limits>
Alex Perrycb7da4b2019-08-28 19:35:56 -07005
Austin Schuhd7e252d2019-10-06 13:51:02 -07006namespace aos {
7
8void Tokenizer::ConsumeWhitespace() {
9 while (true) {
10 if (AtEnd()) {
11 return;
12 }
13 // Skip any whitespace.
14 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
15 ConsumeChar();
16 } else if (Char() == '\n') {
17 ConsumeChar();
18 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070019 } else if (Consume("/*")) {
20 while (!Consume("*/")) {
21 if (Char() == '\n') {
22 ++linenumber_;
23 }
24 ConsumeChar();
25 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070026 } else {
27 // There is no fail. Once we are out of whitespace (including 0 of it),
28 // declare success.
29 return;
30 }
31 }
32}
33
34bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080035 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070036 while (true) {
37 // Finishing the token is success.
38 if (*token == '\0') {
39 return true;
40 }
41
42 // But finishing the data first is failure.
43 if (AtEnd()) {
44 data_ = original;
45 return false;
46 }
47
48 // Missmatch is failure.
49 if (*token != Char()) {
50 data_ = original;
51 return false;
52 }
53
54 ConsumeChar();
55 ++token;
56 }
57}
58
59bool Tokenizer::ConsumeString(::std::string *s) {
60 // Under no conditions is it acceptible to run out of data while parsing a
61 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080062 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070063 if (AtEnd()) {
64 return false;
65 }
66
67 // Expect the leading "
68 if (Char() != '"') {
69 return false;
70 }
71
72 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080073 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070074 *s = ::std::string();
75
76 while (true) {
77 if (AtEnd()) {
78 data_ = original;
79 return false;
80 }
81
82 // If we get an end or an escape, do something special.
83 if (Char() == '"' || Char() == '\\') {
84 // Save what we found up until now, not including this character.
85 *s += ::std::string(
86 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
87
88 // Update the pointer.
89 last_parsed_data = data_;
90
91 // " is the end, declare victory.
92 if (Char() == '"') {
93 ConsumeChar();
94 return true;
95 } else {
96 ConsumeChar();
97 // Now consume valid escape characters and add their representation onto
98 // the output string.
99 if (AtEnd()) {
100 data_ = original;
101 return false;
102 } else if (Char() == '"') {
103 *s += "\"";
104 } else if (Char() == '\\') {
105 *s += "\\";
106 } else if (Char() == '/') {
107 *s += "/";
108 } else if (Char() == 'b') {
109 *s += "\b";
110 } else if (Char() == 'f') {
111 *s += "\f";
112 } else if (Char() == 'n') {
113 *s += "\n";
114 } else if (Char() == 'r') {
115 *s += "\r";
116 } else if (Char() == 't') {
117 *s += "\t";
118 } else if (Char() == 'u') {
119 // TODO(austin): Unicode should be valid, but I really don't care to
120 // do this now...
121 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
122 data_ = original;
123 return false;
124 }
125 }
126 // And skip the escaped character.
127 last_parsed_data = data_.substr(1);
128 }
129
130 ConsumeChar();
131 }
132}
133
134bool Tokenizer::ConsumeNumber(::std::string *s) {
135 // Under no conditions is it acceptible to run out of data while parsing a
136 // number. Any AtEnd() checks should confirm that.
137 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800138 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700139
140 // Consume the leading - unconditionally.
141 Consume("-");
142
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800143 // See if we find nan. This isn't standards compliant, but is what
144 // flatbuffers prints out, so we need to parse it.
145 if (Consume("nan")) {
146 *s = ::std::string(original.substr(0, original.size() - data_.size()));
147 return true;
148 }
149
Brian Silverman714b1d62020-04-28 16:52:54 -0700150 // People tend to use null instead of nan. Accept that too.
151 if (Consume("null")) {
152 *s = ::std::string("nan");
153 return true;
154 }
155
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700156 // Inf is also acceptable.
157 if (Consume("inf")) {
158 *s = ::std::string(original.substr(0, original.size() - data_.size()));
159 return true;
160 }
161
Austin Schuhd7e252d2019-10-06 13:51:02 -0700162 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
163 // by a second number.
164 if (!Consume("0")) {
165 if (AtEnd()) {
166 return false;
167 } else if (Char() >= '1' && Char() <= '9') {
168 // This wasn't a zero, but was a valid digit. Consume it.
169 ConsumeChar();
170 } else {
171 return false;
172 }
173
174 // Now consume any number of any digits.
175 while (true) {
176 if (AtEnd()) {
177 data_ = original;
178 return false;
179 }
180 if (Char() < '0' || Char() > '9') {
181 break;
182 }
183 ConsumeChar();
184 }
185 }
186
187 // We could now have a decimal.
188 if (Char() == '.') {
189 ConsumeChar();
190 while (true) {
191 if (AtEnd()) {
192 data_ = original;
193 return false;
194 }
195 // And any number of digits.
196 if (Char() < '0' || Char() > '9') {
197 break;
198 }
199 ConsumeChar();
200 }
201 }
202
203 // And now an exponent.
204 if (Char() == 'e' || Char() == 'E') {
205 ConsumeChar();
206 if (AtEnd()) {
207 data_ = original;
208 return false;
209 }
210
211 // Which could have a +-
212 if (Char() == '+' || Char() == '-') {
213 ConsumeChar();
214 }
215 int count = 0;
216 while (true) {
217 if (AtEnd()) {
218 data_ = original;
219 return false;
220 }
221 // And digits.
222 if (Char() < '0' || Char() > '9') {
223 break;
224 }
225 ConsumeChar();
226 ++count;
227 }
228 // But, it is an error to have an exponent and nothing following it.
229 if (count == 0) {
230 data_ = original;
231 return false;
232 }
233 }
234
235 *s = ::std::string(original.substr(0, original.size() - data_.size()));
236 return true;
237}
238
239Tokenizer::TokenType Tokenizer::Next() {
240 switch (state_) {
241 case State::kExpectObjectStart:
242 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700243 if (!Consume("{")) {
244 fprintf(stderr, "Error on line %d, expected { for start.\n",
245 linenumber_);
246 return TokenType::kError;
247 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700248
249 // Document that we just started an object.
250 object_type_.push_back(ObjectType::kObject);
251
252 ConsumeWhitespace();
253
254 if (Consume("}")) {
255 ConsumeWhitespace();
256 state_ = State::kExpectObjectEnd;
257 } else {
258 state_ = State::kExpectField;
259 }
260 return TokenType::kStartObject;
261
262 case State::kExpectField: {
263 // Fields are built up of strings, whitespace, and then a : (followed by
264 // whitespace...)
265 ::std::string s;
266 if (!ConsumeString(&s)) {
267 fprintf(stderr, "Error on line %d, expected string for field name.\n",
268 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800269 if (Consume("}")) {
270 fprintf(stderr,
271 "Got '}' instead. Did you add an extra trailing ','?\n");
272 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700273 return TokenType::kError;
274 }
275 field_name_ = ::std::move(s);
276
277 ConsumeWhitespace();
278
279 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800280 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
281 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700282 return TokenType::kError;
283 }
284
285 ConsumeWhitespace();
286
287 state_ = State::kExpectValue;
288
289 return TokenType::kField;
290 } break;
291 case State::kExpectValue: {
292 TokenType result = TokenType::kError;
293
294 ::std::string s;
295 if (Consume("{")) {
296 // Fields are in objects. Record and recurse.
297 object_type_.push_back(ObjectType::kObject);
298
299 ConsumeWhitespace();
300
Alex Perrycb7da4b2019-08-28 19:35:56 -0700301 // And then if we encounter the end again, go to the end state.
302 if (Consume("}")) {
303 ConsumeWhitespace();
304 state_ = State::kExpectObjectEnd;
305 } else {
306 state_ = State::kExpectField;
307 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700308 return TokenType::kStartObject;
309 } else if (Consume("[")) {
310 // Values are in arrays. Record and recurse.
311 object_type_.push_back(ObjectType::kArray);
312
313 ConsumeWhitespace();
314 state_ = State::kExpectValue;
315 return TokenType::kStartArray;
316 } else if (ConsumeString(&s)) {
317 // Parsed as a string, grab it.
318 field_value_ = ::std::move(s);
319 result = TokenType::kStringValue;
320 } else if (ConsumeNumber(&s)) {
321 // Parsed as a number, grab it.
322 field_value_ = ::std::move(s);
323 result = TokenType::kNumberValue;
324 } else if (Consume("true")) {
325 // Parsed as a true, grab it.
326 field_value_ = "true";
327 result = TokenType::kTrueValue;
328 } else if (Consume("false")) {
329 // Parsed as a false, grab it.
330 field_value_ = "false";
331 result = TokenType::kFalseValue;
332 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700333 switch (object_type_.back()) {
334 case ObjectType::kObject:
335 if (Consume("}")) {
336 ConsumeWhitespace();
337 state_ = State::kExpectObjectEnd;
338 return Next();
339 }
340 break;
341 case ObjectType::kArray:
342 if (Consume("]")) {
343 ConsumeWhitespace();
344 state_ = State::kExpectArrayEnd;
345 return Next();
346 }
347 break;
348 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700349 // Couldn't parse, so we have a syntax error.
350 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
351 }
352
353 ConsumeWhitespace();
354
355 // After a field, we either have a , and another field (or value if we are
356 // in an array), or we should be closing out the object (or array).
357 if (Consume(",")) {
358 ConsumeWhitespace();
359 switch (object_type_.back()) {
360 case ObjectType::kObject:
361 state_ = State::kExpectField;
362 break;
363 case ObjectType::kArray:
364 state_ = State::kExpectValue;
365 break;
366 }
367 } else {
368 // Sanity check that the stack is deep enough.
369 if (object_type_.size() == 0) {
370 fprintf(stderr, "Error on line %d\n", linenumber_);
371 return TokenType::kError;
372 }
373
374 // And then require closing out the object.
375 switch (object_type_.back()) {
376 case ObjectType::kObject:
377 if (Consume("}")) {
378 ConsumeWhitespace();
379 state_ = State::kExpectObjectEnd;
380 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700381 fprintf(stderr, "Error on line %d, expected } or ,\n",
382 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700383 return TokenType::kError;
384 }
385 break;
386 case ObjectType::kArray:
387 if (Consume("]")) {
388 ConsumeWhitespace();
389 state_ = State::kExpectArrayEnd;
390 } else {
Austin Schuh60e77942022-05-16 17:48:24 -0700391 fprintf(stderr, "Error on line %d, expected ] or ,\n",
392 linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700393 return TokenType::kError;
394 }
395 break;
396 }
397 }
398 return result;
399 } break;
400
401 case State::kExpectArrayEnd:
402 case State::kExpectObjectEnd: {
403 const TokenType result = state_ == State::kExpectArrayEnd
404 ? TokenType::kEndArray
405 : TokenType::kEndObject;
406 // This is a transient state so we can send 2 tokens out in a row. We
407 // discover the object or array end at the end of reading the value.
408 object_type_.pop_back();
409 if (object_type_.size() == 0) {
410 // We unwound the outer object. We should send kEnd next.
411 state_ = State::kExpectEnd;
412 } else if (object_type_.back() == ObjectType::kObject) {
413 // If we are going into an object, it should either have another field
414 // or end.
415 if (Consume(",")) {
416 ConsumeWhitespace();
417 state_ = State::kExpectField;
418 } else if (Consume("}")) {
419 ConsumeWhitespace();
420 state_ = State::kExpectObjectEnd;
421 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800422 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700423 return TokenType::kError;
424 }
425 } else if (object_type_.back() == ObjectType::kArray) {
426 // If we are going into an array, it should either have another value
427 // or end.
428 if (Consume(",")) {
429 ConsumeWhitespace();
430 state_ = State::kExpectValue;
431 } else if (Consume("]")) {
432 ConsumeWhitespace();
433 state_ = State::kExpectArrayEnd;
434 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800435 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700436 return TokenType::kError;
437 }
438 }
439 // And then send out the correct token.
440 return result;
441 }
442 case State::kExpectEnd:
443 // If we are supposed to be done, confirm nothing is after the end.
444 if (AtEnd()) {
445 return TokenType::kEnd;
446 } else {
447 fprintf(stderr, "Data past end at line %d\n", linenumber_);
448 return TokenType::kError;
449 }
450 }
451 return TokenType::kError;
452}
453
454bool Tokenizer::FieldAsInt(long long *value) {
455 const char *pos = field_value().c_str();
456 errno = 0;
457 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
458 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
459 return false;
460 }
461 return true;
462}
463
464bool Tokenizer::FieldAsDouble(double *value) {
465 const char *pos = field_value().c_str();
466 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800467 if (field_value() == "nan") {
468 *value = std::numeric_limits<double>::quiet_NaN();
469 return true;
470 } else if (field_value() == "-nan") {
471 *value = -std::numeric_limits<double>::quiet_NaN();
472 return true;
473 }
474
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700475 if (field_value() == "inf") {
476 *value = std::numeric_limits<double>::infinity();
477 return true;
478 } else if (field_value() == "-inf") {
479 *value = -std::numeric_limits<double>::infinity();
480 return true;
481 }
482
Austin Schuhd7e252d2019-10-06 13:51:02 -0700483 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
484
485 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
486 return false;
487 }
488 return true;
489}
490
491} // namespace aos