blob: 9403daa729e01030c903585ecebe2d88bd758bd3 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080034 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070035 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080061 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070062 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080072 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070073 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800137 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800142 // See if we find nan. This isn't standards compliant, but is what
143 // flatbuffers prints out, so we need to parse it.
144 if (Consume("nan")) {
145 *s = ::std::string(original.substr(0, original.size() - data_.size()));
146 return true;
147 }
148
Brian Silverman714b1d62020-04-28 16:52:54 -0700149 // People tend to use null instead of nan. Accept that too.
150 if (Consume("null")) {
151 *s = ::std::string("nan");
152 return true;
153 }
154
Austin Schuhd7e252d2019-10-06 13:51:02 -0700155 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
156 // by a second number.
157 if (!Consume("0")) {
158 if (AtEnd()) {
159 return false;
160 } else if (Char() >= '1' && Char() <= '9') {
161 // This wasn't a zero, but was a valid digit. Consume it.
162 ConsumeChar();
163 } else {
164 return false;
165 }
166
167 // Now consume any number of any digits.
168 while (true) {
169 if (AtEnd()) {
170 data_ = original;
171 return false;
172 }
173 if (Char() < '0' || Char() > '9') {
174 break;
175 }
176 ConsumeChar();
177 }
178 }
179
180 // We could now have a decimal.
181 if (Char() == '.') {
182 ConsumeChar();
183 while (true) {
184 if (AtEnd()) {
185 data_ = original;
186 return false;
187 }
188 // And any number of digits.
189 if (Char() < '0' || Char() > '9') {
190 break;
191 }
192 ConsumeChar();
193 }
194 }
195
196 // And now an exponent.
197 if (Char() == 'e' || Char() == 'E') {
198 ConsumeChar();
199 if (AtEnd()) {
200 data_ = original;
201 return false;
202 }
203
204 // Which could have a +-
205 if (Char() == '+' || Char() == '-') {
206 ConsumeChar();
207 }
208 int count = 0;
209 while (true) {
210 if (AtEnd()) {
211 data_ = original;
212 return false;
213 }
214 // And digits.
215 if (Char() < '0' || Char() > '9') {
216 break;
217 }
218 ConsumeChar();
219 ++count;
220 }
221 // But, it is an error to have an exponent and nothing following it.
222 if (count == 0) {
223 data_ = original;
224 return false;
225 }
226 }
227
228 *s = ::std::string(original.substr(0, original.size() - data_.size()));
229 return true;
230}
231
232Tokenizer::TokenType Tokenizer::Next() {
233 switch (state_) {
234 case State::kExpectObjectStart:
235 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700236 if (!Consume("{")) {
237 fprintf(stderr, "Error on line %d, expected { for start.\n",
238 linenumber_);
239 return TokenType::kError;
240 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700241
242 // Document that we just started an object.
243 object_type_.push_back(ObjectType::kObject);
244
245 ConsumeWhitespace();
246
247 if (Consume("}")) {
248 ConsumeWhitespace();
249 state_ = State::kExpectObjectEnd;
250 } else {
251 state_ = State::kExpectField;
252 }
253 return TokenType::kStartObject;
254
255 case State::kExpectField: {
256 // Fields are built up of strings, whitespace, and then a : (followed by
257 // whitespace...)
258 ::std::string s;
259 if (!ConsumeString(&s)) {
260 fprintf(stderr, "Error on line %d, expected string for field name.\n",
261 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800262 if (Consume("}")) {
263 fprintf(stderr,
264 "Got '}' instead. Did you add an extra trailing ','?\n");
265 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700266 return TokenType::kError;
267 }
268 field_name_ = ::std::move(s);
269
270 ConsumeWhitespace();
271
272 if (!Consume(":")) {
Austin Schuh2595a142020-11-29 22:43:57 -0800273 fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
274 linenumber_, Char());
Austin Schuhd7e252d2019-10-06 13:51:02 -0700275 return TokenType::kError;
276 }
277
278 ConsumeWhitespace();
279
280 state_ = State::kExpectValue;
281
282 return TokenType::kField;
283 } break;
284 case State::kExpectValue: {
285 TokenType result = TokenType::kError;
286
287 ::std::string s;
288 if (Consume("{")) {
289 // Fields are in objects. Record and recurse.
290 object_type_.push_back(ObjectType::kObject);
291
292 ConsumeWhitespace();
293
Alex Perrycb7da4b2019-08-28 19:35:56 -0700294 // And then if we encounter the end again, go to the end state.
295 if (Consume("}")) {
296 ConsumeWhitespace();
297 state_ = State::kExpectObjectEnd;
298 } else {
299 state_ = State::kExpectField;
300 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700301 return TokenType::kStartObject;
302 } else if (Consume("[")) {
303 // Values are in arrays. Record and recurse.
304 object_type_.push_back(ObjectType::kArray);
305
306 ConsumeWhitespace();
307 state_ = State::kExpectValue;
308 return TokenType::kStartArray;
309 } else if (ConsumeString(&s)) {
310 // Parsed as a string, grab it.
311 field_value_ = ::std::move(s);
312 result = TokenType::kStringValue;
313 } else if (ConsumeNumber(&s)) {
314 // Parsed as a number, grab it.
315 field_value_ = ::std::move(s);
316 result = TokenType::kNumberValue;
317 } else if (Consume("true")) {
318 // Parsed as a true, grab it.
319 field_value_ = "true";
320 result = TokenType::kTrueValue;
321 } else if (Consume("false")) {
322 // Parsed as a false, grab it.
323 field_value_ = "false";
324 result = TokenType::kFalseValue;
325 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700326 switch (object_type_.back()) {
327 case ObjectType::kObject:
328 if (Consume("}")) {
329 ConsumeWhitespace();
330 state_ = State::kExpectObjectEnd;
331 return Next();
332 }
333 break;
334 case ObjectType::kArray:
335 if (Consume("]")) {
336 ConsumeWhitespace();
337 state_ = State::kExpectArrayEnd;
338 return Next();
339 }
340 break;
341 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700342 // Couldn't parse, so we have a syntax error.
343 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
344 }
345
346 ConsumeWhitespace();
347
348 // After a field, we either have a , and another field (or value if we are
349 // in an array), or we should be closing out the object (or array).
350 if (Consume(",")) {
351 ConsumeWhitespace();
352 switch (object_type_.back()) {
353 case ObjectType::kObject:
354 state_ = State::kExpectField;
355 break;
356 case ObjectType::kArray:
357 state_ = State::kExpectValue;
358 break;
359 }
360 } else {
361 // Sanity check that the stack is deep enough.
362 if (object_type_.size() == 0) {
363 fprintf(stderr, "Error on line %d\n", linenumber_);
364 return TokenType::kError;
365 }
366
367 // And then require closing out the object.
368 switch (object_type_.back()) {
369 case ObjectType::kObject:
370 if (Consume("}")) {
371 ConsumeWhitespace();
372 state_ = State::kExpectObjectEnd;
373 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800374 fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700375 return TokenType::kError;
376 }
377 break;
378 case ObjectType::kArray:
379 if (Consume("]")) {
380 ConsumeWhitespace();
381 state_ = State::kExpectArrayEnd;
382 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800383 fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700384 return TokenType::kError;
385 }
386 break;
387 }
388 }
389 return result;
390 } break;
391
392 case State::kExpectArrayEnd:
393 case State::kExpectObjectEnd: {
394 const TokenType result = state_ == State::kExpectArrayEnd
395 ? TokenType::kEndArray
396 : TokenType::kEndObject;
397 // This is a transient state so we can send 2 tokens out in a row. We
398 // discover the object or array end at the end of reading the value.
399 object_type_.pop_back();
400 if (object_type_.size() == 0) {
401 // We unwound the outer object. We should send kEnd next.
402 state_ = State::kExpectEnd;
403 } else if (object_type_.back() == ObjectType::kObject) {
404 // If we are going into an object, it should either have another field
405 // or end.
406 if (Consume(",")) {
407 ConsumeWhitespace();
408 state_ = State::kExpectField;
409 } else if (Consume("}")) {
410 ConsumeWhitespace();
411 state_ = State::kExpectObjectEnd;
412 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800413 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700414 return TokenType::kError;
415 }
416 } else if (object_type_.back() == ObjectType::kArray) {
417 // If we are going into an array, it should either have another value
418 // or end.
419 if (Consume(",")) {
420 ConsumeWhitespace();
421 state_ = State::kExpectValue;
422 } else if (Consume("]")) {
423 ConsumeWhitespace();
424 state_ = State::kExpectArrayEnd;
425 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800426 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700427 return TokenType::kError;
428 }
429 }
430 // And then send out the correct token.
431 return result;
432 }
433 case State::kExpectEnd:
434 // If we are supposed to be done, confirm nothing is after the end.
435 if (AtEnd()) {
436 return TokenType::kEnd;
437 } else {
438 fprintf(stderr, "Data past end at line %d\n", linenumber_);
439 return TokenType::kError;
440 }
441 }
442 return TokenType::kError;
443}
444
445bool Tokenizer::FieldAsInt(long long *value) {
446 const char *pos = field_value().c_str();
447 errno = 0;
448 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
449 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
450 return false;
451 }
452 return true;
453}
454
455bool Tokenizer::FieldAsDouble(double *value) {
456 const char *pos = field_value().c_str();
457 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800458 if (field_value() == "nan") {
459 *value = std::numeric_limits<double>::quiet_NaN();
460 return true;
461 } else if (field_value() == "-nan") {
462 *value = -std::numeric_limits<double>::quiet_NaN();
463 return true;
464 }
465
Austin Schuhd7e252d2019-10-06 13:51:02 -0700466 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
467
468 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
469 return false;
470 }
471 return true;
472}
473
474} // namespace aos