blob: 47fcdbe3efea0c69e4f5f0ff26ec6d15d6012b2f [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080034 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070035 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080061 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070062 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080072 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070073 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800137 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800142 // See if we find nan. This isn't standards compliant, but is what
143 // flatbuffers prints out, so we need to parse it.
144 if (Consume("nan")) {
145 *s = ::std::string(original.substr(0, original.size() - data_.size()));
146 return true;
147 }
148
Brian Silverman714b1d62020-04-28 16:52:54 -0700149 // People tend to use null instead of nan. Accept that too.
150 if (Consume("null")) {
151 *s = ::std::string("nan");
152 return true;
153 }
154
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700155 // Inf is also acceptable.
156 if (Consume("inf")) {
157 *s = ::std::string(original.substr(0, original.size() - data_.size()));
158 return true;
159 }
160
Austin Schuhd7e252d2019-10-06 13:51:02 -0700161 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
162 // by a second number.
163 if (!Consume("0")) {
164 if (AtEnd()) {
165 return false;
166 } else if (Char() >= '1' && Char() <= '9') {
167 // This wasn't a zero, but was a valid digit. Consume it.
168 ConsumeChar();
169 } else {
170 return false;
171 }
172
173 // Now consume any number of any digits.
174 while (true) {
175 if (AtEnd()) {
176 data_ = original;
177 return false;
178 }
179 if (Char() < '0' || Char() > '9') {
180 break;
181 }
182 ConsumeChar();
183 }
184 }
185
186 // We could now have a decimal.
187 if (Char() == '.') {
188 ConsumeChar();
189 while (true) {
190 if (AtEnd()) {
191 data_ = original;
192 return false;
193 }
194 // And any number of digits.
195 if (Char() < '0' || Char() > '9') {
196 break;
197 }
198 ConsumeChar();
199 }
200 }
201
202 // And now an exponent.
203 if (Char() == 'e' || Char() == 'E') {
204 ConsumeChar();
205 if (AtEnd()) {
206 data_ = original;
207 return false;
208 }
209
210 // Which could have a +-
211 if (Char() == '+' || Char() == '-') {
212 ConsumeChar();
213 }
214 int count = 0;
215 while (true) {
216 if (AtEnd()) {
217 data_ = original;
218 return false;
219 }
220 // And digits.
221 if (Char() < '0' || Char() > '9') {
222 break;
223 }
224 ConsumeChar();
225 ++count;
226 }
227 // But, it is an error to have an exponent and nothing following it.
228 if (count == 0) {
229 data_ = original;
230 return false;
231 }
232 }
233
234 *s = ::std::string(original.substr(0, original.size() - data_.size()));
235 return true;
236}
237
238Tokenizer::TokenType Tokenizer::Next() {
239 switch (state_) {
240 case State::kExpectObjectStart:
241 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700242 if (!Consume("{")) {
243 fprintf(stderr, "Error on line %d, expected { for start.\n",
244 linenumber_);
245 return TokenType::kError;
246 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700247
248 // Document that we just started an object.
249 object_type_.push_back(ObjectType::kObject);
250
251 ConsumeWhitespace();
252
253 if (Consume("}")) {
254 ConsumeWhitespace();
255 state_ = State::kExpectObjectEnd;
256 } else {
257 state_ = State::kExpectField;
258 }
259 return TokenType::kStartObject;
260
261 case State::kExpectField: {
262 // Fields are built up of strings, whitespace, and then a : (followed by
263 // whitespace...)
264 ::std::string s;
265 if (!ConsumeString(&s)) {
266 fprintf(stderr, "Error on line %d, expected string for field name.\n",
267 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800268 if (Consume("}")) {
269 fprintf(stderr,
270 "Got '}' instead. Did you add an extra trailing ','?\n");
271 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700272 return TokenType::kError;
273 }
274 field_name_ = ::std::move(s);
275
276 ConsumeWhitespace();
277
278 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800279 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
280 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700281 return TokenType::kError;
282 }
283
284 ConsumeWhitespace();
285
286 state_ = State::kExpectValue;
287
288 return TokenType::kField;
289 } break;
290 case State::kExpectValue: {
291 TokenType result = TokenType::kError;
292
293 ::std::string s;
294 if (Consume("{")) {
295 // Fields are in objects. Record and recurse.
296 object_type_.push_back(ObjectType::kObject);
297
298 ConsumeWhitespace();
299
Alex Perrycb7da4b2019-08-28 19:35:56 -0700300 // And then if we encounter the end again, go to the end state.
301 if (Consume("}")) {
302 ConsumeWhitespace();
303 state_ = State::kExpectObjectEnd;
304 } else {
305 state_ = State::kExpectField;
306 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700307 return TokenType::kStartObject;
308 } else if (Consume("[")) {
309 // Values are in arrays. Record and recurse.
310 object_type_.push_back(ObjectType::kArray);
311
312 ConsumeWhitespace();
313 state_ = State::kExpectValue;
314 return TokenType::kStartArray;
315 } else if (ConsumeString(&s)) {
316 // Parsed as a string, grab it.
317 field_value_ = ::std::move(s);
318 result = TokenType::kStringValue;
319 } else if (ConsumeNumber(&s)) {
320 // Parsed as a number, grab it.
321 field_value_ = ::std::move(s);
322 result = TokenType::kNumberValue;
323 } else if (Consume("true")) {
324 // Parsed as a true, grab it.
325 field_value_ = "true";
326 result = TokenType::kTrueValue;
327 } else if (Consume("false")) {
328 // Parsed as a false, grab it.
329 field_value_ = "false";
330 result = TokenType::kFalseValue;
331 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700332 switch (object_type_.back()) {
333 case ObjectType::kObject:
334 if (Consume("}")) {
335 ConsumeWhitespace();
336 state_ = State::kExpectObjectEnd;
337 return Next();
338 }
339 break;
340 case ObjectType::kArray:
341 if (Consume("]")) {
342 ConsumeWhitespace();
343 state_ = State::kExpectArrayEnd;
344 return Next();
345 }
346 break;
347 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700348 // Couldn't parse, so we have a syntax error.
349 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
350 }
351
352 ConsumeWhitespace();
353
354 // After a field, we either have a , and another field (or value if we are
355 // in an array), or we should be closing out the object (or array).
356 if (Consume(",")) {
357 ConsumeWhitespace();
358 switch (object_type_.back()) {
359 case ObjectType::kObject:
360 state_ = State::kExpectField;
361 break;
362 case ObjectType::kArray:
363 state_ = State::kExpectValue;
364 break;
365 }
366 } else {
367 // Sanity check that the stack is deep enough.
368 if (object_type_.size() == 0) {
369 fprintf(stderr, "Error on line %d\n", linenumber_);
370 return TokenType::kError;
371 }
372
373 // And then require closing out the object.
374 switch (object_type_.back()) {
375 case ObjectType::kObject:
376 if (Consume("}")) {
377 ConsumeWhitespace();
378 state_ = State::kExpectObjectEnd;
379 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800380 fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700381 return TokenType::kError;
382 }
383 break;
384 case ObjectType::kArray:
385 if (Consume("]")) {
386 ConsumeWhitespace();
387 state_ = State::kExpectArrayEnd;
388 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800389 fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700390 return TokenType::kError;
391 }
392 break;
393 }
394 }
395 return result;
396 } break;
397
398 case State::kExpectArrayEnd:
399 case State::kExpectObjectEnd: {
400 const TokenType result = state_ == State::kExpectArrayEnd
401 ? TokenType::kEndArray
402 : TokenType::kEndObject;
403 // This is a transient state so we can send 2 tokens out in a row. We
404 // discover the object or array end at the end of reading the value.
405 object_type_.pop_back();
406 if (object_type_.size() == 0) {
407 // We unwound the outer object. We should send kEnd next.
408 state_ = State::kExpectEnd;
409 } else if (object_type_.back() == ObjectType::kObject) {
410 // If we are going into an object, it should either have another field
411 // or end.
412 if (Consume(",")) {
413 ConsumeWhitespace();
414 state_ = State::kExpectField;
415 } else if (Consume("}")) {
416 ConsumeWhitespace();
417 state_ = State::kExpectObjectEnd;
418 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800419 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700420 return TokenType::kError;
421 }
422 } else if (object_type_.back() == ObjectType::kArray) {
423 // If we are going into an array, it should either have another value
424 // or end.
425 if (Consume(",")) {
426 ConsumeWhitespace();
427 state_ = State::kExpectValue;
428 } else if (Consume("]")) {
429 ConsumeWhitespace();
430 state_ = State::kExpectArrayEnd;
431 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800432 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700433 return TokenType::kError;
434 }
435 }
436 // And then send out the correct token.
437 return result;
438 }
439 case State::kExpectEnd:
440 // If we are supposed to be done, confirm nothing is after the end.
441 if (AtEnd()) {
442 return TokenType::kEnd;
443 } else {
444 fprintf(stderr, "Data past end at line %d\n", linenumber_);
445 return TokenType::kError;
446 }
447 }
448 return TokenType::kError;
449}
450
451bool Tokenizer::FieldAsInt(long long *value) {
452 const char *pos = field_value().c_str();
453 errno = 0;
454 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
455 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
456 return false;
457 }
458 return true;
459}
460
461bool Tokenizer::FieldAsDouble(double *value) {
462 const char *pos = field_value().c_str();
463 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800464 if (field_value() == "nan") {
465 *value = std::numeric_limits<double>::quiet_NaN();
466 return true;
467 } else if (field_value() == "-nan") {
468 *value = -std::numeric_limits<double>::quiet_NaN();
469 return true;
470 }
471
Austin Schuh9fa0b8e2021-03-21 19:21:50 -0700472 if (field_value() == "inf") {
473 *value = std::numeric_limits<double>::infinity();
474 return true;
475 } else if (field_value() == "-inf") {
476 *value = -std::numeric_limits<double>::infinity();
477 return true;
478 }
479
Austin Schuhd7e252d2019-10-06 13:51:02 -0700480 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
481
482 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
483 return false;
484 }
485 return true;
486}
487
488} // namespace aos