blob: 38ff4e368b086bf5d33af141936ab64e2151c756 [file] [log] [blame]
Austin Schuhd7e252d2019-10-06 13:51:02 -07001#include "aos/json_tokenizer.h"
2
3namespace aos {
4
5void Tokenizer::ConsumeWhitespace() {
6 while (true) {
7 if (AtEnd()) {
8 return;
9 }
10 // Skip any whitespace.
11 if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
12 ConsumeChar();
13 } else if (Char() == '\n') {
14 ConsumeChar();
15 ++linenumber_;
16 } else {
17 // There is no fail. Once we are out of whitespace (including 0 of it),
18 // declare success.
19 return;
20 }
21 }
22}
23
24bool Tokenizer::Consume(const char *token) {
25 const absl::string_view original = data_;
26 while (true) {
27 // Finishing the token is success.
28 if (*token == '\0') {
29 return true;
30 }
31
32 // But finishing the data first is failure.
33 if (AtEnd()) {
34 data_ = original;
35 return false;
36 }
37
38 // Missmatch is failure.
39 if (*token != Char()) {
40 data_ = original;
41 return false;
42 }
43
44 ConsumeChar();
45 ++token;
46 }
47}
48
49bool Tokenizer::ConsumeString(::std::string *s) {
50 // Under no conditions is it acceptible to run out of data while parsing a
51 // string. Any AtEnd checks should confirm that.
52 const absl::string_view original = data_;
53 if (AtEnd()) {
54 return false;
55 }
56
57 // Expect the leading "
58 if (Char() != '"') {
59 return false;
60 }
61
62 ConsumeChar();
63 absl::string_view last_parsed_data = data_;
64 *s = ::std::string();
65
66 while (true) {
67 if (AtEnd()) {
68 data_ = original;
69 return false;
70 }
71
72 // If we get an end or an escape, do something special.
73 if (Char() == '"' || Char() == '\\') {
74 // Save what we found up until now, not including this character.
75 *s += ::std::string(
76 last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
77
78 // Update the pointer.
79 last_parsed_data = data_;
80
81 // " is the end, declare victory.
82 if (Char() == '"') {
83 ConsumeChar();
84 return true;
85 } else {
86 ConsumeChar();
87 // Now consume valid escape characters and add their representation onto
88 // the output string.
89 if (AtEnd()) {
90 data_ = original;
91 return false;
92 } else if (Char() == '"') {
93 *s += "\"";
94 } else if (Char() == '\\') {
95 *s += "\\";
96 } else if (Char() == '/') {
97 *s += "/";
98 } else if (Char() == 'b') {
99 *s += "\b";
100 } else if (Char() == 'f') {
101 *s += "\f";
102 } else if (Char() == 'n') {
103 *s += "\n";
104 } else if (Char() == 'r') {
105 *s += "\r";
106 } else if (Char() == 't') {
107 *s += "\t";
108 } else if (Char() == 'u') {
109 // TODO(austin): Unicode should be valid, but I really don't care to
110 // do this now...
111 fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
112 data_ = original;
113 return false;
114 }
115 }
116 // And skip the escaped character.
117 last_parsed_data = data_.substr(1);
118 }
119
120 ConsumeChar();
121 }
122}
123
124bool Tokenizer::ConsumeNumber(::std::string *s) {
125 // Under no conditions is it acceptible to run out of data while parsing a
126 // number. Any AtEnd() checks should confirm that.
127 *s = ::std::string();
128 const absl::string_view original = data_;
129
130 // Consume the leading - unconditionally.
131 Consume("-");
132
133 // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
134 // by a second number.
135 if (!Consume("0")) {
136 if (AtEnd()) {
137 return false;
138 } else if (Char() >= '1' && Char() <= '9') {
139 // This wasn't a zero, but was a valid digit. Consume it.
140 ConsumeChar();
141 } else {
142 return false;
143 }
144
145 // Now consume any number of any digits.
146 while (true) {
147 if (AtEnd()) {
148 data_ = original;
149 return false;
150 }
151 if (Char() < '0' || Char() > '9') {
152 break;
153 }
154 ConsumeChar();
155 }
156 }
157
158 // We could now have a decimal.
159 if (Char() == '.') {
160 ConsumeChar();
161 while (true) {
162 if (AtEnd()) {
163 data_ = original;
164 return false;
165 }
166 // And any number of digits.
167 if (Char() < '0' || Char() > '9') {
168 break;
169 }
170 ConsumeChar();
171 }
172 }
173
174 // And now an exponent.
175 if (Char() == 'e' || Char() == 'E') {
176 ConsumeChar();
177 if (AtEnd()) {
178 data_ = original;
179 return false;
180 }
181
182 // Which could have a +-
183 if (Char() == '+' || Char() == '-') {
184 ConsumeChar();
185 }
186 int count = 0;
187 while (true) {
188 if (AtEnd()) {
189 data_ = original;
190 return false;
191 }
192 // And digits.
193 if (Char() < '0' || Char() > '9') {
194 break;
195 }
196 ConsumeChar();
197 ++count;
198 }
199 // But, it is an error to have an exponent and nothing following it.
200 if (count == 0) {
201 data_ = original;
202 return false;
203 }
204 }
205
206 *s = ::std::string(original.substr(0, original.size() - data_.size()));
207 return true;
208}
209
210Tokenizer::TokenType Tokenizer::Next() {
211 switch (state_) {
212 case State::kExpectObjectStart:
213 // We should always start out with a {
214 if (!Consume("{")) return TokenType::kError;
215
216 // Document that we just started an object.
217 object_type_.push_back(ObjectType::kObject);
218
219 ConsumeWhitespace();
220
221 if (Consume("}")) {
222 ConsumeWhitespace();
223 state_ = State::kExpectObjectEnd;
224 } else {
225 state_ = State::kExpectField;
226 }
227 return TokenType::kStartObject;
228
229 case State::kExpectField: {
230 // Fields are built up of strings, whitespace, and then a : (followed by
231 // whitespace...)
232 ::std::string s;
233 if (!ConsumeString(&s)) {
234 fprintf(stderr, "Error on line %d, expected string for field name.\n",
235 linenumber_);
236 return TokenType::kError;
237 }
238 field_name_ = ::std::move(s);
239
240 ConsumeWhitespace();
241
242 if (!Consume(":")) {
243 fprintf(stderr, "Error on line %d\n", linenumber_);
244 return TokenType::kError;
245 }
246
247 ConsumeWhitespace();
248
249 state_ = State::kExpectValue;
250
251 return TokenType::kField;
252 } break;
253 case State::kExpectValue: {
254 TokenType result = TokenType::kError;
255
256 ::std::string s;
257 if (Consume("{")) {
258 // Fields are in objects. Record and recurse.
259 object_type_.push_back(ObjectType::kObject);
260
261 ConsumeWhitespace();
262
263 state_ = State::kExpectField;
264 return TokenType::kStartObject;
265 } else if (Consume("[")) {
266 // Values are in arrays. Record and recurse.
267 object_type_.push_back(ObjectType::kArray);
268
269 ConsumeWhitespace();
270 state_ = State::kExpectValue;
271 return TokenType::kStartArray;
272 } else if (ConsumeString(&s)) {
273 // Parsed as a string, grab it.
274 field_value_ = ::std::move(s);
275 result = TokenType::kStringValue;
276 } else if (ConsumeNumber(&s)) {
277 // Parsed as a number, grab it.
278 field_value_ = ::std::move(s);
279 result = TokenType::kNumberValue;
280 } else if (Consume("true")) {
281 // Parsed as a true, grab it.
282 field_value_ = "true";
283 result = TokenType::kTrueValue;
284 } else if (Consume("false")) {
285 // Parsed as a false, grab it.
286 field_value_ = "false";
287 result = TokenType::kFalseValue;
288 } else {
289 // Couldn't parse, so we have a syntax error.
290 fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
291 }
292
293 ConsumeWhitespace();
294
295 // After a field, we either have a , and another field (or value if we are
296 // in an array), or we should be closing out the object (or array).
297 if (Consume(",")) {
298 ConsumeWhitespace();
299 switch (object_type_.back()) {
300 case ObjectType::kObject:
301 state_ = State::kExpectField;
302 break;
303 case ObjectType::kArray:
304 state_ = State::kExpectValue;
305 break;
306 }
307 } else {
308 // Sanity check that the stack is deep enough.
309 if (object_type_.size() == 0) {
310 fprintf(stderr, "Error on line %d\n", linenumber_);
311 return TokenType::kError;
312 }
313
314 // And then require closing out the object.
315 switch (object_type_.back()) {
316 case ObjectType::kObject:
317 if (Consume("}")) {
318 ConsumeWhitespace();
319 state_ = State::kExpectObjectEnd;
320 } else {
321 return TokenType::kError;
322 }
323 break;
324 case ObjectType::kArray:
325 if (Consume("]")) {
326 ConsumeWhitespace();
327 state_ = State::kExpectArrayEnd;
328 } else {
329 return TokenType::kError;
330 }
331 break;
332 }
333 }
334 return result;
335 } break;
336
337 case State::kExpectArrayEnd:
338 case State::kExpectObjectEnd: {
339 const TokenType result = state_ == State::kExpectArrayEnd
340 ? TokenType::kEndArray
341 : TokenType::kEndObject;
342 // This is a transient state so we can send 2 tokens out in a row. We
343 // discover the object or array end at the end of reading the value.
344 object_type_.pop_back();
345 if (object_type_.size() == 0) {
346 // We unwound the outer object. We should send kEnd next.
347 state_ = State::kExpectEnd;
348 } else if (object_type_.back() == ObjectType::kObject) {
349 // If we are going into an object, it should either have another field
350 // or end.
351 if (Consume(",")) {
352 ConsumeWhitespace();
353 state_ = State::kExpectField;
354 } else if (Consume("}")) {
355 ConsumeWhitespace();
356 state_ = State::kExpectObjectEnd;
357 } else {
358 return TokenType::kError;
359 }
360 } else if (object_type_.back() == ObjectType::kArray) {
361 // If we are going into an array, it should either have another value
362 // or end.
363 if (Consume(",")) {
364 ConsumeWhitespace();
365 state_ = State::kExpectValue;
366 } else if (Consume("]")) {
367 ConsumeWhitespace();
368 state_ = State::kExpectArrayEnd;
369 } else {
370 return TokenType::kError;
371 }
372 }
373 // And then send out the correct token.
374 return result;
375 }
376 case State::kExpectEnd:
377 // If we are supposed to be done, confirm nothing is after the end.
378 if (AtEnd()) {
379 return TokenType::kEnd;
380 } else {
381 fprintf(stderr, "Data past end at line %d\n", linenumber_);
382 return TokenType::kError;
383 }
384 }
385 return TokenType::kError;
386}
387
388bool Tokenizer::FieldAsInt(long long *value) {
389 const char *pos = field_value().c_str();
390 errno = 0;
391 *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
392 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
393 return false;
394 }
395 return true;
396}
397
398bool Tokenizer::FieldAsDouble(double *value) {
399 const char *pos = field_value().c_str();
400 errno = 0;
401 *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
402
403 if (pos != field_value().c_str() + field_value().size() || errno != 0) {
404 return false;
405 }
406 return true;
407}
408
409} // namespace aos