blob: 2486227a8003d1913c363913cc9c5445e3b6c6c5 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
James Kuszmaul3ae42262019-11-08 12:33:41 -080034 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070035 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
James Kuszmaul3ae42262019-11-08 12:33:41 -080061 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070062 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
James Kuszmaul3ae42262019-11-08 12:33:41 -080072 std::string_view last_parsed_data = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070073 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
James Kuszmaul3ae42262019-11-08 12:33:41 -0800137 const std::string_view original = data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -0700138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800142 // See if we find nan. This isn't standards compliant, but is what
143 // flatbuffers prints out, so we need to parse it.
144 if (Consume("nan")) {
145 *s = ::std::string(original.substr(0, original.size() - data_.size()));
146 return true;
147 }
148
Austin Schuhd7e252d2019-10-06 13:51:02 -0700149 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
150 // by a second number.
151 if (!Consume("0")) {
152 if (AtEnd()) {
153 return false;
154 } else if (Char() >= '1' && Char() <= '9') {
155 // This wasn't a zero, but was a valid digit. Consume it.
156 ConsumeChar();
157 } else {
158 return false;
159 }
160
161 // Now consume any number of any digits.
162 while (true) {
163 if (AtEnd()) {
164 data_ = original;
165 return false;
166 }
167 if (Char() < '0' || Char() > '9') {
168 break;
169 }
170 ConsumeChar();
171 }
172 }
173
174 // We could now have a decimal.
175 if (Char() == '.') {
176 ConsumeChar();
177 while (true) {
178 if (AtEnd()) {
179 data_ = original;
180 return false;
181 }
182 // And any number of digits.
183 if (Char() < '0' || Char() > '9') {
184 break;
185 }
186 ConsumeChar();
187 }
188 }
189
190 // And now an exponent.
191 if (Char() == 'e' || Char() == 'E') {
192 ConsumeChar();
193 if (AtEnd()) {
194 data_ = original;
195 return false;
196 }
197
198 // Which could have a +-
199 if (Char() == '+' || Char() == '-') {
200 ConsumeChar();
201 }
202 int count = 0;
203 while (true) {
204 if (AtEnd()) {
205 data_ = original;
206 return false;
207 }
208 // And digits.
209 if (Char() < '0' || Char() > '9') {
210 break;
211 }
212 ConsumeChar();
213 ++count;
214 }
215 // But, it is an error to have an exponent and nothing following it.
216 if (count == 0) {
217 data_ = original;
218 return false;
219 }
220 }
221
222 *s = ::std::string(original.substr(0, original.size() - data_.size()));
223 return true;
224}
225
226Tokenizer::TokenType Tokenizer::Next() {
227 switch (state_) {
228 case State::kExpectObjectStart:
229 // We should always start out with a {
230 if (!Consume("{")) return TokenType::kError;
231
232 // Document that we just started an object.
233 object_type_.push_back(ObjectType::kObject);
234
235 ConsumeWhitespace();
236
237 if (Consume("}")) {
238 ConsumeWhitespace();
239 state_ = State::kExpectObjectEnd;
240 } else {
241 state_ = State::kExpectField;
242 }
243 return TokenType::kStartObject;
244
245 case State::kExpectField: {
246 // Fields are built up of strings, whitespace, and then a : (followed by
247 // whitespace...)
248 ::std::string s;
249 if (!ConsumeString(&s)) {
250 fprintf(stderr, "Error on line %d, expected string for field name.\n",
251 linenumber_);
Austin Schuh92700842019-12-28 13:00:17 -0800252 if (Consume("}")) {
253 fprintf(stderr,
254 "Got '}' instead. Did you add an extra trailing ','?\n");
255 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700256 return TokenType::kError;
257 }
258 field_name_ = ::std::move(s);
259
260 ConsumeWhitespace();
261
262 if (!Consume(":")) {
263 fprintf(stderr, "Error on line %d\n", linenumber_);
264 return TokenType::kError;
265 }
266
267 ConsumeWhitespace();
268
269 state_ = State::kExpectValue;
270
271 return TokenType::kField;
272 } break;
273 case State::kExpectValue: {
274 TokenType result = TokenType::kError;
275
276 ::std::string s;
277 if (Consume("{")) {
278 // Fields are in objects. Record and recurse.
279 object_type_.push_back(ObjectType::kObject);
280
281 ConsumeWhitespace();
282
Alex Perrycb7da4b2019-08-28 19:35:56 -0700283 // And then if we encounter the end again, go to the end state.
284 if (Consume("}")) {
285 ConsumeWhitespace();
286 state_ = State::kExpectObjectEnd;
287 } else {
288 state_ = State::kExpectField;
289 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700290 return TokenType::kStartObject;
291 } else if (Consume("[")) {
292 // Values are in arrays. Record and recurse.
293 object_type_.push_back(ObjectType::kArray);
294
295 ConsumeWhitespace();
296 state_ = State::kExpectValue;
297 return TokenType::kStartArray;
298 } else if (ConsumeString(&s)) {
299 // Parsed as a string, grab it.
300 field_value_ = ::std::move(s);
301 result = TokenType::kStringValue;
302 } else if (ConsumeNumber(&s)) {
303 // Parsed as a number, grab it.
304 field_value_ = ::std::move(s);
305 result = TokenType::kNumberValue;
306 } else if (Consume("true")) {
307 // Parsed as a true, grab it.
308 field_value_ = "true";
309 result = TokenType::kTrueValue;
310 } else if (Consume("false")) {
311 // Parsed as a false, grab it.
312 field_value_ = "false";
313 result = TokenType::kFalseValue;
314 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700315 switch (object_type_.back()) {
316 case ObjectType::kObject:
317 if (Consume("}")) {
318 ConsumeWhitespace();
319 state_ = State::kExpectObjectEnd;
320 return Next();
321 }
322 break;
323 case ObjectType::kArray:
324 if (Consume("]")) {
325 ConsumeWhitespace();
326 state_ = State::kExpectArrayEnd;
327 return Next();
328 }
329 break;
330 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700331 // Couldn't parse, so we have a syntax error.
332 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
333 }
334
335 ConsumeWhitespace();
336
337 // After a field, we either have a , and another field (or value if we are
338 // in an array), or we should be closing out the object (or array).
339 if (Consume(",")) {
340 ConsumeWhitespace();
341 switch (object_type_.back()) {
342 case ObjectType::kObject:
343 state_ = State::kExpectField;
344 break;
345 case ObjectType::kArray:
346 state_ = State::kExpectValue;
347 break;
348 }
349 } else {
350 // Sanity check that the stack is deep enough.
351 if (object_type_.size() == 0) {
352 fprintf(stderr, "Error on line %d\n", linenumber_);
353 return TokenType::kError;
354 }
355
356 // And then require closing out the object.
357 switch (object_type_.back()) {
358 case ObjectType::kObject:
359 if (Consume("}")) {
360 ConsumeWhitespace();
361 state_ = State::kExpectObjectEnd;
362 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800363 fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700364 return TokenType::kError;
365 }
366 break;
367 case ObjectType::kArray:
368 if (Consume("]")) {
369 ConsumeWhitespace();
370 state_ = State::kExpectArrayEnd;
371 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800372 fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700373 return TokenType::kError;
374 }
375 break;
376 }
377 }
378 return result;
379 } break;
380
381 case State::kExpectArrayEnd:
382 case State::kExpectObjectEnd: {
383 const TokenType result = state_ == State::kExpectArrayEnd
384 ? TokenType::kEndArray
385 : TokenType::kEndObject;
386 // This is a transient state so we can send 2 tokens out in a row. We
387 // discover the object or array end at the end of reading the value.
388 object_type_.pop_back();
389 if (object_type_.size() == 0) {
390 // We unwound the outer object. We should send kEnd next.
391 state_ = State::kExpectEnd;
392 } else if (object_type_.back() == ObjectType::kObject) {
393 // If we are going into an object, it should either have another field
394 // or end.
395 if (Consume(",")) {
396 ConsumeWhitespace();
397 state_ = State::kExpectField;
398 } else if (Consume("}")) {
399 ConsumeWhitespace();
400 state_ = State::kExpectObjectEnd;
401 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800402 fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700403 return TokenType::kError;
404 }
405 } else if (object_type_.back() == ObjectType::kArray) {
406 // If we are going into an array, it should either have another value
407 // or end.
408 if (Consume(",")) {
409 ConsumeWhitespace();
410 state_ = State::kExpectValue;
411 } else if (Consume("]")) {
412 ConsumeWhitespace();
413 state_ = State::kExpectArrayEnd;
414 } else {
Austin Schuh217a9782019-12-21 23:02:50 -0800415 fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuhd7e252d2019-10-06 13:51:02 -0700416 return TokenType::kError;
417 }
418 }
419 // And then send out the correct token.
420 return result;
421 }
422 case State::kExpectEnd:
423 // If we are supposed to be done, confirm nothing is after the end.
424 if (AtEnd()) {
425 return TokenType::kEnd;
426 } else {
427 fprintf(stderr, "Data past end at line %d\n", linenumber_);
428 return TokenType::kError;
429 }
430 }
431 return TokenType::kError;
432}
433
434bool Tokenizer::FieldAsInt(long long *value) {
435 const char *pos = field_value().c_str();
436 errno = 0;
437 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
438 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
439 return false;
440 }
441 return true;
442}
443
444bool Tokenizer::FieldAsDouble(double *value) {
445 const char *pos = field_value().c_str();
446 errno = 0;
Austin Schuhbba0c3c2019-11-29 22:00:34 -0800447 if (field_value() == "nan") {
448 *value = std::numeric_limits<double>::quiet_NaN();
449 return true;
450 } else if (field_value() == "-nan") {
451 *value = -std::numeric_limits<double>::quiet_NaN();
452 return true;
453 }
454
Austin Schuhd7e252d2019-10-06 13:51:02 -0700455 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
456
457 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
458 return false;
459 }
460 return true;
461}
462
463} // namespace aos