blob: e3aa7ea34f10fd4e827f62d6092c5a1cdb4dbdd0 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
Alex Perrycb7da4b2019-08-28 19:35:56 -07003#include <cerrno>
4
Austin Schuhd7e252d2019-10-06 13:51:02 -07005namespace aos {
6
7void Tokenizer::ConsumeWhitespace() {
8 while (true) {
9 if (AtEnd()) {
10 return;
11 }
12 // Skip any whitespace.
13 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
14 ConsumeChar();
15 } else if (Char() == '\n') {
16 ConsumeChar();
17 ++linenumber_;
Austin Schuh81da4b22019-10-06 14:03:24 -070018 } else if (Consume("/*")) {
19 while (!Consume("*/")) {
20 if (Char() == '\n') {
21 ++linenumber_;
22 }
23 ConsumeChar();
24 }
Austin Schuhd7e252d2019-10-06 13:51:02 -070025 } else {
26 // There is no fail. Once we are out of whitespace (including 0 of it),
27 // declare success.
28 return;
29 }
30 }
31}
32
33bool Tokenizer::Consume(const char *token) {
34 const absl::string_view original = data_;
35 while (true) {
36 // Finishing the token is success.
37 if (*token == '\0') {
38 return true;
39 }
40
41 // But finishing the data first is failure.
42 if (AtEnd()) {
43 data_ = original;
44 return false;
45 }
46
47 // Missmatch is failure.
48 if (*token != Char()) {
49 data_ = original;
50 return false;
51 }
52
53 ConsumeChar();
54 ++token;
55 }
56}
57
58bool Tokenizer::ConsumeString(::std::string *s) {
59 // Under no conditions is it acceptible to run out of data while parsing a
60 // string. Any AtEnd checks should confirm that.
61 const absl::string_view original = data_;
62 if (AtEnd()) {
63 return false;
64 }
65
66 // Expect the leading "
67 if (Char() != '"') {
68 return false;
69 }
70
71 ConsumeChar();
72 absl::string_view last_parsed_data = data_;
73 *s = ::std::string();
74
75 while (true) {
76 if (AtEnd()) {
77 data_ = original;
78 return false;
79 }
80
81 // If we get an end or an escape, do something special.
82 if (Char() == '"' || Char() == '\\') {
83 // Save what we found up until now, not including this character.
84 *s += ::std::string(
85 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
86
87 // Update the pointer.
88 last_parsed_data = data_;
89
90 // " is the end, declare victory.
91 if (Char() == '"') {
92 ConsumeChar();
93 return true;
94 } else {
95 ConsumeChar();
96 // Now consume valid escape characters and add their representation onto
97 // the output string.
98 if (AtEnd()) {
99 data_ = original;
100 return false;
101 } else if (Char() == '"') {
102 *s += "\"";
103 } else if (Char() == '\\') {
104 *s += "\\";
105 } else if (Char() == '/') {
106 *s += "/";
107 } else if (Char() == 'b') {
108 *s += "\b";
109 } else if (Char() == 'f') {
110 *s += "\f";
111 } else if (Char() == 'n') {
112 *s += "\n";
113 } else if (Char() == 'r') {
114 *s += "\r";
115 } else if (Char() == 't') {
116 *s += "\t";
117 } else if (Char() == 'u') {
118 // TODO(austin): Unicode should be valid, but I really don't care to
119 // do this now...
120 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
121 data_ = original;
122 return false;
123 }
124 }
125 // And skip the escaped character.
126 last_parsed_data = data_.substr(1);
127 }
128
129 ConsumeChar();
130 }
131}
132
133bool Tokenizer::ConsumeNumber(::std::string *s) {
134 // Under no conditions is it acceptible to run out of data while parsing a
135 // number. Any AtEnd() checks should confirm that.
136 *s = ::std::string();
137 const absl::string_view original = data_;
138
139 // Consume the leading - unconditionally.
140 Consume("-");
141
142 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
143 // by a second number.
144 if (!Consume("0")) {
145 if (AtEnd()) {
146 return false;
147 } else if (Char() >= '1' && Char() <= '9') {
148 // This wasn't a zero, but was a valid digit. Consume it.
149 ConsumeChar();
150 } else {
151 return false;
152 }
153
154 // Now consume any number of any digits.
155 while (true) {
156 if (AtEnd()) {
157 data_ = original;
158 return false;
159 }
160 if (Char() < '0' || Char() > '9') {
161 break;
162 }
163 ConsumeChar();
164 }
165 }
166
167 // We could now have a decimal.
168 if (Char() == '.') {
169 ConsumeChar();
170 while (true) {
171 if (AtEnd()) {
172 data_ = original;
173 return false;
174 }
175 // And any number of digits.
176 if (Char() < '0' || Char() > '9') {
177 break;
178 }
179 ConsumeChar();
180 }
181 }
182
183 // And now an exponent.
184 if (Char() == 'e' || Char() == 'E') {
185 ConsumeChar();
186 if (AtEnd()) {
187 data_ = original;
188 return false;
189 }
190
191 // Which could have a +-
192 if (Char() == '+' || Char() == '-') {
193 ConsumeChar();
194 }
195 int count = 0;
196 while (true) {
197 if (AtEnd()) {
198 data_ = original;
199 return false;
200 }
201 // And digits.
202 if (Char() < '0' || Char() > '9') {
203 break;
204 }
205 ConsumeChar();
206 ++count;
207 }
208 // But, it is an error to have an exponent and nothing following it.
209 if (count == 0) {
210 data_ = original;
211 return false;
212 }
213 }
214
215 *s = ::std::string(original.substr(0, original.size() - data_.size()));
216 return true;
217}
218
219Tokenizer::TokenType Tokenizer::Next() {
220 switch (state_) {
221 case State::kExpectObjectStart:
222 // We should always start out with a {
223 if (!Consume("{")) return TokenType::kError;
224
225 // Document that we just started an object.
226 object_type_.push_back(ObjectType::kObject);
227
228 ConsumeWhitespace();
229
230 if (Consume("}")) {
231 ConsumeWhitespace();
232 state_ = State::kExpectObjectEnd;
233 } else {
234 state_ = State::kExpectField;
235 }
236 return TokenType::kStartObject;
237
238 case State::kExpectField: {
239 // Fields are built up of strings, whitespace, and then a : (followed by
240 // whitespace...)
241 ::std::string s;
242 if (!ConsumeString(&s)) {
243 fprintf(stderr, "Error on line %d, expected string for field name.\n",
244 linenumber_);
245 return TokenType::kError;
246 }
247 field_name_ = ::std::move(s);
248
249 ConsumeWhitespace();
250
251 if (!Consume(":")) {
252 fprintf(stderr, "Error on line %d\n", linenumber_);
253 return TokenType::kError;
254 }
255
256 ConsumeWhitespace();
257
258 state_ = State::kExpectValue;
259
260 return TokenType::kField;
261 } break;
262 case State::kExpectValue: {
263 TokenType result = TokenType::kError;
264
265 ::std::string s;
266 if (Consume("{")) {
267 // Fields are in objects. Record and recurse.
268 object_type_.push_back(ObjectType::kObject);
269
270 ConsumeWhitespace();
271
Alex Perrycb7da4b2019-08-28 19:35:56 -0700272 // And then if we encounter the end again, go to the end state.
273 if (Consume("}")) {
274 ConsumeWhitespace();
275 state_ = State::kExpectObjectEnd;
276 } else {
277 state_ = State::kExpectField;
278 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700279 return TokenType::kStartObject;
280 } else if (Consume("[")) {
281 // Values are in arrays. Record and recurse.
282 object_type_.push_back(ObjectType::kArray);
283
284 ConsumeWhitespace();
285 state_ = State::kExpectValue;
286 return TokenType::kStartArray;
287 } else if (ConsumeString(&s)) {
288 // Parsed as a string, grab it.
289 field_value_ = ::std::move(s);
290 result = TokenType::kStringValue;
291 } else if (ConsumeNumber(&s)) {
292 // Parsed as a number, grab it.
293 field_value_ = ::std::move(s);
294 result = TokenType::kNumberValue;
295 } else if (Consume("true")) {
296 // Parsed as a true, grab it.
297 field_value_ = "true";
298 result = TokenType::kTrueValue;
299 } else if (Consume("false")) {
300 // Parsed as a false, grab it.
301 field_value_ = "false";
302 result = TokenType::kFalseValue;
303 } else {
Alex Perrycb7da4b2019-08-28 19:35:56 -0700304 switch (object_type_.back()) {
305 case ObjectType::kObject:
306 if (Consume("}")) {
307 ConsumeWhitespace();
308 state_ = State::kExpectObjectEnd;
309 return Next();
310 }
311 break;
312 case ObjectType::kArray:
313 if (Consume("]")) {
314 ConsumeWhitespace();
315 state_ = State::kExpectArrayEnd;
316 return Next();
317 }
318 break;
319 }
Austin Schuhd7e252d2019-10-06 13:51:02 -0700320 // Couldn't parse, so we have a syntax error.
321 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
322 }
323
324 ConsumeWhitespace();
325
326 // After a field, we either have a , and another field (or value if we are
327 // in an array), or we should be closing out the object (or array).
328 if (Consume(",")) {
329 ConsumeWhitespace();
330 switch (object_type_.back()) {
331 case ObjectType::kObject:
332 state_ = State::kExpectField;
333 break;
334 case ObjectType::kArray:
335 state_ = State::kExpectValue;
336 break;
337 }
338 } else {
339 // Sanity check that the stack is deep enough.
340 if (object_type_.size() == 0) {
341 fprintf(stderr, "Error on line %d\n", linenumber_);
342 return TokenType::kError;
343 }
344
345 // And then require closing out the object.
346 switch (object_type_.back()) {
347 case ObjectType::kObject:
348 if (Consume("}")) {
349 ConsumeWhitespace();
350 state_ = State::kExpectObjectEnd;
351 } else {
352 return TokenType::kError;
353 }
354 break;
355 case ObjectType::kArray:
356 if (Consume("]")) {
357 ConsumeWhitespace();
358 state_ = State::kExpectArrayEnd;
359 } else {
360 return TokenType::kError;
361 }
362 break;
363 }
364 }
365 return result;
366 } break;
367
368 case State::kExpectArrayEnd:
369 case State::kExpectObjectEnd: {
370 const TokenType result = state_ == State::kExpectArrayEnd
371 ? TokenType::kEndArray
372 : TokenType::kEndObject;
373 // This is a transient state so we can send 2 tokens out in a row. We
374 // discover the object or array end at the end of reading the value.
375 object_type_.pop_back();
376 if (object_type_.size() == 0) {
377 // We unwound the outer object. We should send kEnd next.
378 state_ = State::kExpectEnd;
379 } else if (object_type_.back() == ObjectType::kObject) {
380 // If we are going into an object, it should either have another field
381 // or end.
382 if (Consume(",")) {
383 ConsumeWhitespace();
384 state_ = State::kExpectField;
385 } else if (Consume("}")) {
386 ConsumeWhitespace();
387 state_ = State::kExpectObjectEnd;
388 } else {
389 return TokenType::kError;
390 }
391 } else if (object_type_.back() == ObjectType::kArray) {
392 // If we are going into an array, it should either have another value
393 // or end.
394 if (Consume(",")) {
395 ConsumeWhitespace();
396 state_ = State::kExpectValue;
397 } else if (Consume("]")) {
398 ConsumeWhitespace();
399 state_ = State::kExpectArrayEnd;
400 } else {
401 return TokenType::kError;
402 }
403 }
404 // And then send out the correct token.
405 return result;
406 }
407 case State::kExpectEnd:
408 // If we are supposed to be done, confirm nothing is after the end.
409 if (AtEnd()) {
410 return TokenType::kEnd;
411 } else {
412 fprintf(stderr, "Data past end at line %d\n", linenumber_);
413 return TokenType::kError;
414 }
415 }
416 return TokenType::kError;
417}
418
419bool Tokenizer::FieldAsInt(long long *value) {
420 const char *pos = field_value().c_str();
421 errno = 0;
422 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
423 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
424 return false;
425 }
426 return true;
427}
428
429bool Tokenizer::FieldAsDouble(double *value) {
430 const char *pos = field_value().c_str();
431 errno = 0;
432 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
433
434 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
435 return false;
436 }
437 return true;
438}
439
440} // namespace aos