blob: a3d804e62256b3c643836ac07099cc694cc499c4 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080034 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070035 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080061 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070062 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080072 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070073 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800137 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800142 // See if we find nan. This isn't standards compliant, but is what
143 // flatbuffers prints out, so we need to parse it.
144 if (Consume("nan")) {
145 *s = ::std::string(original.substr(0, original.size() - data_.size()));
146 return true;
147 }
148
Brian Silverman714b1d62020-04-28 16:52:54 -0700149 // People tend to use null instead of nan. Accept that too.
150 if (Consume("null")) {
151 *s = ::std::string("nan");
152 return true;
153 }
154
Austin Schuhd7e252d2019-10-06 13:51:02 -0700155 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
156 // by a second number.
157 if (!Consume("0")) {
158 if (AtEnd()) {
159 return false;
160 } else if (Char() >= '1' && Char() <= '9') {
161 // This wasn't a zero, but was a valid digit. Consume it.
162 ConsumeChar();
163 } else {
164 return false;
165 }
166
167 // Now consume any number of any digits.
168 while (true) {
169 if (AtEnd()) {
170 data_ = original;
171 return false;
172 }
173 if (Char() < '0' || Char() > '9') {
174 break;
175 }
176 ConsumeChar();
177 }
178 }
179
180 // We could now have a decimal.
181 if (Char() == '.') {
182 ConsumeChar();
183 while (true) {
184 if (AtEnd()) {
185 data_ = original;
186 return false;
187 }
188 // And any number of digits.
189 if (Char() < '0' || Char() > '9') {
190 break;
191 }
192 ConsumeChar();
193 }
194 }
195
196 // And now an exponent.
197 if (Char() == 'e' || Char() == 'E') {
198 ConsumeChar();
199 if (AtEnd()) {
200 data_ = original;
201 return false;
202 }
203
204 // Which could have a +-
205 if (Char() == '+' || Char() == '-') {
206 ConsumeChar();
207 }
208 int count = 0;
209 while (true) {
210 if (AtEnd()) {
211 data_ = original;
212 return false;
213 }
214 // And digits.
215 if (Char() < '0' || Char() > '9') {
216 break;
217 }
218 ConsumeChar();
219 ++count;
220 }
221 // But, it is an error to have an exponent and nothing following it.
222 if (count == 0) {
223 data_ = original;
224 return false;
225 }
226 }
227
228 *s = ::std::string(original.substr(0, original.size() - data_.size()));
229 return true;
230}
231
232Tokenizer::TokenType Tokenizer::Next() {
233 switch (state_) {
234 case State::kExpectObjectStart:
235 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700236 if (!Consume("{")) {
237 fprintf(stderr, "Error on line %d, expected { for start.\n",
238 linenumber_);
239 return TokenType::kError;
240 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700241
242 // Document that we just started an object.
243 object_type_.push_back(ObjectType::kObject);
244
245 ConsumeWhitespace();
246
247 if (Consume("}")) {
248 ConsumeWhitespace();
249 state_ = State::kExpectObjectEnd;
250 } else {
251 state_ = State::kExpectField;
252 }
253 return TokenType::kStartObject;
254
255 case State::kExpectField: {
256 // Fields are built up of strings, whitespace, and then a : (followed by
257 // whitespace...)
258 ::std::string s;
259 if (!ConsumeString(&s)) {
260 fprintf(stderr, "Error on line %d, expected string for field name.\n",
261 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800262 if (Consume("}")) {
263 fprintf(stderr,
264 "Got '}' instead. Did you add an extra trailing ','?\n");
265 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700266 return TokenType::kError;
267 }
268 field_name_ = ::std::move(s);
269
270 ConsumeWhitespace();
271
272 if (!Consume(":")) {
273 fprintf(stderr, "Error on line %d\n", linenumber_);
274 return TokenType::kError;
275 }
276
277 ConsumeWhitespace();
278
279 state_ = State::kExpectValue;
280
281 return TokenType::kField;
282 } break;
283 case State::kExpectValue: {
284 TokenType result = TokenType::kError;
285
286 ::std::string s;
287 if (Consume("{")) {
288 // Fields are in objects. Record and recurse.
289 object_type_.push_back(ObjectType::kObject);
290
291 ConsumeWhitespace();
292
Alex Perrycb7da4b2019-08-28 19:35:56 -0700293 // And then if we encounter the end again, go to the end state.
294 if (Consume("}")) {
295 ConsumeWhitespace();
296 state_ = State::kExpectObjectEnd;
297 } else {
298 state_ = State::kExpectField;
299 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700300 return TokenType::kStartObject;
301 } else if (Consume("[")) {
302 // Values are in arrays. Record and recurse.
303 object_type_.push_back(ObjectType::kArray);
304
305 ConsumeWhitespace();
306 state_ = State::kExpectValue;
307 return TokenType::kStartArray;
308 } else if (ConsumeString(&s)) {
309 // Parsed as a string, grab it.
310 field_value_ = ::std::move(s);
311 result = TokenType::kStringValue;
312 } else if (ConsumeNumber(&s)) {
313 // Parsed as a number, grab it.
314 field_value_ = ::std::move(s);
315 result = TokenType::kNumberValue;
316 } else if (Consume("true")) {
317 // Parsed as a true, grab it.
318 field_value_ = "true";
319 result = TokenType::kTrueValue;
320 } else if (Consume("false")) {
321 // Parsed as a false, grab it.
322 field_value_ = "false";
323 result = TokenType::kFalseValue;
324 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700325 switch (object_type_.back()) {
326 case ObjectType::kObject:
327 if (Consume("}")) {
328 ConsumeWhitespace();
329 state_ = State::kExpectObjectEnd;
330 return Next();
331 }
332 break;
333 case ObjectType::kArray:
334 if (Consume("]")) {
335 ConsumeWhitespace();
336 state_ = State::kExpectArrayEnd;
337 return Next();
338 }
339 break;
340 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700341 // Couldn't parse, so we have a syntax error.
342 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
343 }
344
345 ConsumeWhitespace();
346
347 // After a field, we either have a , and another field (or value if we are
348 // in an array), or we should be closing out the object (or array).
349 if (Consume(",")) {
350 ConsumeWhitespace();
351 switch (object_type_.back()) {
352 case ObjectType::kObject:
353 state_ = State::kExpectField;
354 break;
355 case ObjectType::kArray:
356 state_ = State::kExpectValue;
357 break;
358 }
359 } else {
360 // Sanity check that the stack is deep enough.
361 if (object_type_.size() == 0) {
362 fprintf(stderr, "Error on line %d\n", linenumber_);
363 return TokenType::kError;
364 }
365
366 // And then require closing out the object.
367 switch (object_type_.back()) {
368 case ObjectType::kObject:
369 if (Consume("}")) {
370 ConsumeWhitespace();
371 state_ = State::kExpectObjectEnd;
372 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800373 fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700374 return TokenType::kError;
375 }
376 break;
377 case ObjectType::kArray:
378 if (Consume("]")) {
379 ConsumeWhitespace();
380 state_ = State::kExpectArrayEnd;
381 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800382 fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700383 return TokenType::kError;
384 }
385 break;
386 }
387 }
388 return result;
389 } break;
390
391 case State::kExpectArrayEnd:
392 case State::kExpectObjectEnd: {
393 const TokenType result = state_ == State::kExpectArrayEnd
394 ? TokenType::kEndArray
395 : TokenType::kEndObject;
396 // This is a transient state so we can send 2 tokens out in a row. We
397 // discover the object or array end at the end of reading the value.
398 object_type_.pop_back();
399 if (object_type_.size() == 0) {
400 // We unwound the outer object. We should send kEnd next.
401 state_ = State::kExpectEnd;
402 } else if (object_type_.back() == ObjectType::kObject) {
403 // If we are going into an object, it should either have another field
404 // or end.
405 if (Consume(",")) {
406 ConsumeWhitespace();
407 state_ = State::kExpectField;
408 } else if (Consume("}")) {
409 ConsumeWhitespace();
410 state_ = State::kExpectObjectEnd;
411 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800412 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700413 return TokenType::kError;
414 }
415 } else if (object_type_.back() == ObjectType::kArray) {
416 // If we are going into an array, it should either have another value
417 // or end.
418 if (Consume(",")) {
419 ConsumeWhitespace();
420 state_ = State::kExpectValue;
421 } else if (Consume("]")) {
422 ConsumeWhitespace();
423 state_ = State::kExpectArrayEnd;
424 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800425 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700426 return TokenType::kError;
427 }
428 }
429 // And then send out the correct token.
430 return result;
431 }
432 case State::kExpectEnd:
433 // If we are supposed to be done, confirm nothing is after the end.
434 if (AtEnd()) {
435 return TokenType::kEnd;
436 } else {
437 fprintf(stderr, "Data past end at line %d\n", linenumber_);
438 return TokenType::kError;
439 }
440 }
441 return TokenType::kError;
442}
443
444bool Tokenizer::FieldAsInt(long long *value) {
445 const char *pos = field_value().c_str();
446 errno = 0;
447 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
448 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
449 return false;
450 }
451 return true;
452}
453
454bool Tokenizer::FieldAsDouble(double *value) {
455 const char *pos = field_value().c_str();
456 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800457 if (field_value() == "nan") {
458 *value = std::numeric_limits<double>::quiet_NaN();
459 return true;
460 } else if (field_value() == "-nan") {
461 *value = -std::numeric_limits<double>::quiet_NaN();
462 return true;
463 }
464
Austin Schuhd7e252d2019-10-06 13:51:02 -0700465 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
466
467 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
468 return false;
469 }
470 return true;
471}
472
473} // namespace aos