#include "aos/json_tokenizer.h"

namespace aos {

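// Consumes spaces, tabs, carriage returns, and newlines (counting newlines so
// errors can report a line number), as well as /* */ style comments.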
void Tokenizer::ConsumeWhitespace() {
  while (true) {
    if (AtEnd()) {
      return;
    }
    // Skip any whitespace.
    if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
      ConsumeChar();
    } else if (Char() == '\n') {
      ConsumeChar();
      ++linenumber_;
    } else if (Consume("/*")) {
      while (!Consume("*/")) {
        if (AtEnd()) {
          // An unterminated comment leaves nothing more to consume; let the
          // caller's next parse step report the error.
          return;
        }
        if (Char() == '\n') {
          ++linenumber_;
        }
        ConsumeChar();
      }
    } else {
      // There is no failure case. Once we are out of whitespace (including
      // none at all), declare success.
      return;
    }
  }
}

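// Attempts to consume the literal |token| at the current position. On a
// mismatch, or if the data runs out first, the tokenizer is restored to where
// it started and false is returned.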
bool Tokenizer::Consume(const char *token) {
  const absl::string_view original = data_;
  while (true) {
    // Finishing the token is success.
    if (*token == '\0') {
      return true;
    }

    // But finishing the data first is failure.
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // Mismatch is failure.
    if (*token != Char()) {
      data_ = original;
      return false;
    }

    ConsumeChar();
    ++token;
  }
}

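// Parses a double-quoted JSON string at the current position into *s,
// translating escape sequences as it goes. On any failure the tokenizer is
// restored to where it started.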
bool Tokenizer::ConsumeString(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // string. Any AtEnd checks should confirm that.
  const absl::string_view original = data_;
  if (AtEnd()) {
    return false;
  }

  // Expect the leading "
  if (Char() != '"') {
    return false;
  }

  ConsumeChar();
  absl::string_view last_parsed_data = data_;
  *s = ::std::string();

  while (true) {
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // If we get an end or an escape, do something special.
    if (Char() == '"' || Char() == '\\') {
      // Save what we found up until now, not including this character.
      *s += ::std::string(
          last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));

      // Update the pointer.
      last_parsed_data = data_;

      // " is the end, declare victory.
      if (Char() == '"') {
        ConsumeChar();
        return true;
      } else {
        ConsumeChar();
        // Now consume valid escape characters and add their representation
        // onto the output string.
        if (AtEnd()) {
          data_ = original;
          return false;
        } else if (Char() == '"') {
          *s += "\"";
        } else if (Char() == '\\') {
          *s += "\\";
        } else if (Char() == '/') {
          *s += "/";
        } else if (Char() == 'b') {
          *s += "\b";
        } else if (Char() == 'f') {
          *s += "\f";
        } else if (Char() == 'n') {
          *s += "\n";
        } else if (Char() == 'r') {
          *s += "\r";
        } else if (Char() == 't') {
          *s += "\t";
        } else if (Char() == 'u') {
          // TODO(austin): Unicode should be valid, but I really don't care to
          // do this now...
          fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
          data_ = original;
          return false;
        }
      }
      // And skip the escaped character.
      last_parsed_data = data_.substr(1);
    }

    ConsumeChar();
  }
}

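// Parses a JSON number (optional leading '-', integer part, optional
// fraction, optional exponent) at the current position and returns its raw
// text in *s. On failure the tokenizer is restored to where it started.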
bool Tokenizer::ConsumeNumber(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // number. Any AtEnd() checks should confirm that.
  *s = ::std::string();
  const absl::string_view original = data_;

  // Consume the leading - unconditionally.
  Consume("-");

  // Then, we either get a 0, or we get a nonzero digit. Only a nonzero
  // leading digit may be followed by more digits.
  if (!Consume("0")) {
    if (AtEnd()) {
      return false;
    } else if (Char() >= '1' && Char() <= '9') {
      // This wasn't a zero, but was a valid digit. Consume it.
      ConsumeChar();
    } else {
      return false;
    }

    // Now consume any number of any digits.
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
    }
  }

  // We could now have a decimal. Guard the Char() peek so a number at the
  // very end of the data doesn't read past it.
  if (!AtEnd() && Char() == '.') {
    ConsumeChar();
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      // And any number of digits.
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
    }
  }

  // And now an exponent.
  if (!AtEnd() && (Char() == 'e' || Char() == 'E')) {
    ConsumeChar();
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // Which could have a +-
    if (Char() == '+' || Char() == '-') {
      ConsumeChar();
    }
    int count = 0;
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      // And digits.
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
      ++count;
    }
    // But, it is an error to have an exponent and nothing following it.
    if (count == 0) {
      data_ = original;
      return false;
    }
  }

  *s = ::std::string(original.substr(0, original.size() - data_.size()));
  return true;
}

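// State machine producing one token per call. object_type_ is the stack of
// enclosing objects/arrays, so after each value we know whether to expect
// another field (object) or another value (array), or to close out the
// current nesting level.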
Tokenizer::TokenType Tokenizer::Next() {
  switch (state_) {
    case State::kExpectObjectStart:
      // We should always start out with a {
      if (!Consume("{")) return TokenType::kError;

      // Document that we just started an object.
      object_type_.push_back(ObjectType::kObject);

      ConsumeWhitespace();

      if (Consume("}")) {
        ConsumeWhitespace();
        state_ = State::kExpectObjectEnd;
      } else {
        state_ = State::kExpectField;
      }
      return TokenType::kStartObject;

    case State::kExpectField: {
      // Fields are built up of strings, whitespace, and then a : (followed by
      // whitespace...)
      ::std::string s;
      if (!ConsumeString(&s)) {
        fprintf(stderr, "Error on line %d, expected string for field name.\n",
                linenumber_);
        return TokenType::kError;
      }
      field_name_ = ::std::move(s);

      ConsumeWhitespace();

      if (!Consume(":")) {
        fprintf(stderr, "Error on line %d\n", linenumber_);
        return TokenType::kError;
      }

      ConsumeWhitespace();

      state_ = State::kExpectValue;

      return TokenType::kField;
    } break;
    case State::kExpectValue: {
      TokenType result = TokenType::kError;

      ::std::string s;
      if (Consume("{")) {
        // Fields are in objects. Record and recurse.
        object_type_.push_back(ObjectType::kObject);

        ConsumeWhitespace();

        state_ = State::kExpectField;
        return TokenType::kStartObject;
      } else if (Consume("[")) {
        // Values are in arrays. Record and recurse.
        object_type_.push_back(ObjectType::kArray);

        ConsumeWhitespace();
        state_ = State::kExpectValue;
        return TokenType::kStartArray;
      } else if (ConsumeString(&s)) {
        // Parsed as a string, grab it.
        field_value_ = ::std::move(s);
        result = TokenType::kStringValue;
      } else if (ConsumeNumber(&s)) {
        // Parsed as a number, grab it.
        field_value_ = ::std::move(s);
        result = TokenType::kNumberValue;
      } else if (Consume("true")) {
        // Parsed as a true, grab it.
        field_value_ = "true";
        result = TokenType::kTrueValue;
      } else if (Consume("false")) {
        // Parsed as a false, grab it.
        field_value_ = "false";
        result = TokenType::kFalseValue;
      } else {
        // Couldn't parse, so we have a syntax error.
        fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
      }

      ConsumeWhitespace();

      // After a field, we either have a , and another field (or value if we
      // are in an array), or we should be closing out the object (or array).
      if (Consume(",")) {
        ConsumeWhitespace();
        switch (object_type_.back()) {
          case ObjectType::kObject:
            state_ = State::kExpectField;
            break;
          case ObjectType::kArray:
            state_ = State::kExpectValue;
            break;
        }
      } else {
        // Sanity check that the stack is deep enough.
        if (object_type_.size() == 0) {
          fprintf(stderr, "Error on line %d\n", linenumber_);
          return TokenType::kError;
        }

        // And then require closing out the object.
        switch (object_type_.back()) {
          case ObjectType::kObject:
            if (Consume("}")) {
              ConsumeWhitespace();
              state_ = State::kExpectObjectEnd;
            } else {
              return TokenType::kError;
            }
            break;
          case ObjectType::kArray:
            if (Consume("]")) {
              ConsumeWhitespace();
              state_ = State::kExpectArrayEnd;
            } else {
              return TokenType::kError;
            }
            break;
        }
      }
      return result;
    } break;

    case State::kExpectArrayEnd:
    case State::kExpectObjectEnd: {
      const TokenType result = state_ == State::kExpectArrayEnd
                                   ? TokenType::kEndArray
                                   : TokenType::kEndObject;
      // This is a transient state so we can send 2 tokens out in a row. We
      // discover the object or array end at the end of reading the value.
      object_type_.pop_back();
      if (object_type_.size() == 0) {
        // We unwound the outer object. We should send kEnd next.
        state_ = State::kExpectEnd;
      } else if (object_type_.back() == ObjectType::kObject) {
        // If we are going into an object, it should either have another field
        // or end.
        if (Consume(",")) {
          ConsumeWhitespace();
          state_ = State::kExpectField;
        } else if (Consume("}")) {
          ConsumeWhitespace();
          state_ = State::kExpectObjectEnd;
        } else {
          return TokenType::kError;
        }
      } else if (object_type_.back() == ObjectType::kArray) {
        // If we are going into an array, it should either have another value
        // or end.
        if (Consume(",")) {
          ConsumeWhitespace();
          state_ = State::kExpectValue;
        } else if (Consume("]")) {
          ConsumeWhitespace();
          state_ = State::kExpectArrayEnd;
        } else {
          return TokenType::kError;
        }
      }
      // And then send out the correct token.
      return result;
    }
    case State::kExpectEnd:
      // If we are supposed to be done, confirm nothing is after the end.
      if (AtEnd()) {
        return TokenType::kEnd;
      } else {
        fprintf(stderr, "Data past end at line %d\n", linenumber_);
        return TokenType::kError;
      }
  }
  return TokenType::kError;
}

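// Interprets the most recently parsed field value as a base-10 long long
// using strtoll, failing if there are trailing characters or the value is
// out of range.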
bool Tokenizer::FieldAsInt(long long *value) {
  const char *pos = field_value().c_str();
  errno = 0;
  *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
    return false;
  }
  return true;
}

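// Interprets the most recently parsed field value as a double using strtod,
// with the same trailing-character and range checks as FieldAsInt.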
bool Tokenizer::FieldAsDouble(double *value) {
  const char *pos = field_value().c_str();
  errno = 0;
  *value = strtod(field_value().c_str(), const_cast<char **>(&pos));

  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
    return false;
  }
  return true;
}

}  // namespace aos
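// Example of driving the tokenizer (a minimal sketch; it assumes the
// constructor declared in json_tokenizer.h takes the JSON text as an
// absl::string_view and that a field_name() accessor exists alongside
// field_value()):
//
//   aos::Tokenizer tokenizer(R"({"x": 971})");
//   while (true) {
//     const aos::Tokenizer::TokenType token = tokenizer.Next();
//     if (token == aos::Tokenizer::TokenType::kEnd ||
//         token == aos::Tokenizer::TokenType::kError) {
//       break;
//     }
//     if (token == aos::Tokenizer::TokenType::kNumberValue) {
//       long long value;
//       if (tokenizer.FieldAsInt(&value)) {
//         printf("%s -> %lld\n", tokenizer.field_name().c_str(), value);
//       }
//     }
//   }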