blob: c54e0ed4d89e665e80a4348a4760ec6d92cd4328 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080034 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070035 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080061 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070062 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080072 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070073 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800137 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800142 // See if we find nan. This isn't standards compliant, but is what
143 // flatbuffers prints out, so we need to parse it.
144 if (Consume("nan")) {
145 *s = ::std::string(original.substr(0, original.size() - data_.size()));
146 return true;
147 }
148
Austin Schuhd7e252d2019-10-06 13:51:02 -0700149 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
150 // by a second number.
151 if (!Consume("0")) {
152 if (AtEnd()) {
153 return false;
154 } else if (Char() >= '1' && Char() <= '9') {
155 // This wasn't a zero, but was a valid digit. Consume it.
156 ConsumeChar();
157 } else {
158 return false;
159 }
160
161 // Now consume any number of any digits.
162 while (true) {
163 if (AtEnd()) {
164 data_ = original;
165 return false;
166 }
167 if (Char() < '0' || Char() > '9') {
168 break;
169 }
170 ConsumeChar();
171 }
172 }
173
174 // We could now have a decimal.
175 if (Char() == '.') {
176 ConsumeChar();
177 while (true) {
178 if (AtEnd()) {
179 data_ = original;
180 return false;
181 }
182 // And any number of digits.
183 if (Char() < '0' || Char() > '9') {
184 break;
185 }
186 ConsumeChar();
187 }
188 }
189
190 // And now an exponent.
191 if (Char() == 'e' || Char() == 'E') {
192 ConsumeChar();
193 if (AtEnd()) {
194 data_ = original;
195 return false;
196 }
197
198 // Which could have a +-
199 if (Char() == '+' || Char() == '-') {
200 ConsumeChar();
201 }
202 int count = 0;
203 while (true) {
204 if (AtEnd()) {
205 data_ = original;
206 return false;
207 }
208 // And digits.
209 if (Char() < '0' || Char() > '9') {
210 break;
211 }
212 ConsumeChar();
213 ++count;
214 }
215 // But, it is an error to have an exponent and nothing following it.
216 if (count == 0) {
217 data_ = original;
218 return false;
219 }
220 }
221
222 *s = ::std::string(original.substr(0, original.size() - data_.size()));
223 return true;
224}
225
226Tokenizer::TokenType Tokenizer::Next() {
227 switch (state_) {
228 case State::kExpectObjectStart:
229 // We should always start out with a {
Austin Schuh6f896702020-03-19 16:07:20 -0700230 if (!Consume("{")) {
231 fprintf(stderr, "Error on line %d, expected { for start.\n",
232 linenumber_);
233 return TokenType::kError;
234 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700235
236 // Document that we just started an object.
237 object_type_.push_back(ObjectType::kObject);
238
239 ConsumeWhitespace();
240
241 if (Consume("}")) {
242 ConsumeWhitespace();
243 state_ = State::kExpectObjectEnd;
244 } else {
245 state_ = State::kExpectField;
246 }
247 return TokenType::kStartObject;
248
249 case State::kExpectField: {
250 // Fields are built up of strings, whitespace, and then a : (followed by
251 // whitespace...)
252 ::std::string s;
253 if (!ConsumeString(&s)) {
254 fprintf(stderr, "Error on line %d, expected string for field name.\n",
255 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800256 if (Consume("}")) {
257 fprintf(stderr,
258 "Got '}' instead. Did you add an extra trailing ','?\n");
259 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700260 return TokenType::kError;
261 }
262 field_name_ = ::std::move(s);
263
264 ConsumeWhitespace();
265
266 if (!Consume(":")) {
267 fprintf(stderr, "Error on line %d\n", linenumber_);
268 return TokenType::kError;
269 }
270
271 ConsumeWhitespace();
272
273 state_ = State::kExpectValue;
274
275 return TokenType::kField;
276 } break;
277 case State::kExpectValue: {
278 TokenType result = TokenType::kError;
279
280 ::std::string s;
281 if (Consume("{")) {
282 // Fields are in objects. Record and recurse.
283 object_type_.push_back(ObjectType::kObject);
284
285 ConsumeWhitespace();
286
Alex Perrycb7da4b2019-08-28 19:35:56 -0700287 // And then if we encounter the end again, go to the end state.
288 if (Consume("}")) {
289 ConsumeWhitespace();
290 state_ = State::kExpectObjectEnd;
291 } else {
292 state_ = State::kExpectField;
293 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700294 return TokenType::kStartObject;
295 } else if (Consume("[")) {
296 // Values are in arrays. Record and recurse.
297 object_type_.push_back(ObjectType::kArray);
298
299 ConsumeWhitespace();
300 state_ = State::kExpectValue;
301 return TokenType::kStartArray;
302 } else if (ConsumeString(&s)) {
303 // Parsed as a string, grab it.
304 field_value_ = ::std::move(s);
305 result = TokenType::kStringValue;
306 } else if (ConsumeNumber(&s)) {
307 // Parsed as a number, grab it.
308 field_value_ = ::std::move(s);
309 result = TokenType::kNumberValue;
310 } else if (Consume("true")) {
311 // Parsed as a true, grab it.
312 field_value_ = "true";
313 result = TokenType::kTrueValue;
314 } else if (Consume("false")) {
315 // Parsed as a false, grab it.
316 field_value_ = "false";
317 result = TokenType::kFalseValue;
318 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700319 switch (object_type_.back()) {
320 case ObjectType::kObject:
321 if (Consume("}")) {
322 ConsumeWhitespace();
323 state_ = State::kExpectObjectEnd;
324 return Next();
325 }
326 break;
327 case ObjectType::kArray:
328 if (Consume("]")) {
329 ConsumeWhitespace();
330 state_ = State::kExpectArrayEnd;
331 return Next();
332 }
333 break;
334 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700335 // Couldn't parse, so we have a syntax error.
336 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
337 }
338
339 ConsumeWhitespace();
340
341 // After a field, we either have a , and another field (or value if we are
342 // in an array), or we should be closing out the object (or array).
343 if (Consume(",")) {
344 ConsumeWhitespace();
345 switch (object_type_.back()) {
346 case ObjectType::kObject:
347 state_ = State::kExpectField;
348 break;
349 case ObjectType::kArray:
350 state_ = State::kExpectValue;
351 break;
352 }
353 } else {
354 // Sanity check that the stack is deep enough.
355 if (object_type_.size() == 0) {
356 fprintf(stderr, "Error on line %d\n", linenumber_);
357 return TokenType::kError;
358 }
359
360 // And then require closing out the object.
361 switch (object_type_.back()) {
362 case ObjectType::kObject:
363 if (Consume("}")) {
364 ConsumeWhitespace();
365 state_ = State::kExpectObjectEnd;
366 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800367 fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700368 return TokenType::kError;
369 }
370 break;
371 case ObjectType::kArray:
372 if (Consume("]")) {
373 ConsumeWhitespace();
374 state_ = State::kExpectArrayEnd;
375 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800376 fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700377 return TokenType::kError;
378 }
379 break;
380 }
381 }
382 return result;
383 } break;
384
385 case State::kExpectArrayEnd:
386 case State::kExpectObjectEnd: {
387 const TokenType result = state_ == State::kExpectArrayEnd
388 ? TokenType::kEndArray
389 : TokenType::kEndObject;
390 // This is a transient state so we can send 2 tokens out in a row. We
391 // discover the object or array end at the end of reading the value.
392 object_type_.pop_back();
393 if (object_type_.size() == 0) {
394 // We unwound the outer object. We should send kEnd next.
395 state_ = State::kExpectEnd;
396 } else if (object_type_.back() == ObjectType::kObject) {
397 // If we are going into an object, it should either have another field
398 // or end.
399 if (Consume(",")) {
400 ConsumeWhitespace();
401 state_ = State::kExpectField;
402 } else if (Consume("}")) {
403 ConsumeWhitespace();
404 state_ = State::kExpectObjectEnd;
405 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800406 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700407 return TokenType::kError;
408 }
409 } else if (object_type_.back() == ObjectType::kArray) {
410 // If we are going into an array, it should either have another value
411 // or end.
412 if (Consume(",")) {
413 ConsumeWhitespace();
414 state_ = State::kExpectValue;
415 } else if (Consume("]")) {
416 ConsumeWhitespace();
417 state_ = State::kExpectArrayEnd;
418 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800419 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700420 return TokenType::kError;
421 }
422 }
423 // And then send out the correct token.
424 return result;
425 }
426 case State::kExpectEnd:
427 // If we are supposed to be done, confirm nothing is after the end.
428 if (AtEnd()) {
429 return TokenType::kEnd;
430 } else {
431 fprintf(stderr, "Data past end at line %d\n", linenumber_);
432 return TokenType::kError;
433 }
434 }
435 return TokenType::kError;
436}
437
438bool Tokenizer::FieldAsInt(long long *value) {
439 const char *pos = field_value().c_str();
440 errno = 0;
441 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
442 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
443 return false;
444 }
445 return true;
446}
447
448bool Tokenizer::FieldAsDouble(double *value) {
449 const char *pos = field_value().c_str();
450 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800451 if (field_value() == "nan") {
452 *value = std::numeric_limits<double>::quiet_NaN();
453 return true;
454 } else if (field_value() == "-nan") {
455 *value = -std::numeric_limits<double>::quiet_NaN();
456 return true;
457 }
458
Austin Schuhd7e252d2019-10-06 13:51:02 -0700459 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
460
461 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
462 return false;
463 }
464 return true;
465}
466
467} // namespace aos