Factor tokenizer out to a separate file.
json_to_flatbuffer.{cc,h} was getting too big.
Change-Id: I38c10001b15da7fa2c0bfef66be5e640d1fa446f
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
new file mode 100644
index 0000000..38ff4e3
--- /dev/null
+++ b/aos/json_tokenizer.cc
@@ -0,0 +1,409 @@
+#include "aos/json_tokenizer.h"
+
+namespace aos {
+
+// Consumes any run of whitespace (space, '\r', '\t', '\n') from the front of
+// data_, counting newlines into linenumber_ for error reporting.  Cannot
+// fail: zero characters of whitespace is also success.
+void Tokenizer::ConsumeWhitespace() {
+  while (true) {
+    if (AtEnd()) {
+      return;
+    }
+    // Skip any whitespace.
+    if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
+      ConsumeChar();
+    } else if (Char() == '\n') {
+      ConsumeChar();
+      // Track line numbers so error messages can point at the right line.
+      ++linenumber_;
+    } else {
+      // There is no fail. Once we are out of whitespace (including 0 of it),
+      // declare success.
+      return;
+    }
+  }
+}
+
+// Attempts to consume the literal string |token| from the front of data_.
+// Returns true and advances data_ past the token on success; on any failure
+// (mismatch or running out of data) data_ is restored to where it started
+// and false is returned.
+bool Tokenizer::Consume(const char *token) {
+  // Remember where we started so failure can rewind.
+  const absl::string_view original = data_;
+  while (true) {
+    // Finishing the token is success.
+    if (*token == '\0') {
+      return true;
+    }
+
+    // But finishing the data first is failure.
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // Mismatch is failure.
+    if (*token != Char()) {
+      data_ = original;
+      return false;
+    }
+
+    ConsumeChar();
+    ++token;
+  }
+}
+
+// Consumes a double-quoted JSON string from data_, un-escaping it into *s.
+// On success, data_ is advanced past the closing quote and true is returned;
+// on failure data_ is restored and the contents of *s are unspecified.
+bool Tokenizer::ConsumeString(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // string. Any AtEnd checks should confirm that.
+  const absl::string_view original = data_;
+  if (AtEnd()) {
+    return false;
+  }
+
+  // Expect the leading "
+  if (Char() != '"') {
+    return false;
+  }
+
+  ConsumeChar();
+  // last_parsed_data marks the start of the current run of plain
+  // (non-escaped) characters, so they can be appended to *s in one chunk.
+  absl::string_view last_parsed_data = data_;
+  *s = ::std::string();
+
+  while (true) {
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // If we get an end or an escape, do something special.
+    if (Char() == '"' || Char() == '\\') {
+      // Save what we found up until now, not including this character.
+      *s += ::std::string(
+          last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
+
+      // Update the pointer.
+      last_parsed_data = data_;
+
+      // " is the end, declare victory.
+      if (Char() == '"') {
+        ConsumeChar();
+        return true;
+      } else {
+        ConsumeChar();
+        // Now consume valid escape characters and add their representation
+        // onto the output string.
+        if (AtEnd()) {
+          data_ = original;
+          return false;
+        } else if (Char() == '"') {
+          *s += "\"";
+        } else if (Char() == '\\') {
+          *s += "\\";
+        } else if (Char() == '/') {
+          *s += "/";
+        } else if (Char() == 'b') {
+          *s += "\b";
+        } else if (Char() == 'f') {
+          *s += "\f";
+        } else if (Char() == 'n') {
+          *s += "\n";
+        } else if (Char() == 'r') {
+          *s += "\r";
+        } else if (Char() == 't') {
+          *s += "\t";
+        } else if (Char() == 'u') {
+          // TODO(austin): Unicode should be valid, but I really don't care to
+          // do this now...
+          fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
+          data_ = original;
+          return false;
+        }
+        // NOTE(review): a backslash followed by any character not listed
+        // above falls through here without appending anything, so an
+        // unrecognized escape is silently dropped rather than rejected --
+        // confirm that is intended.
+      }
+      // And skip the escaped character.
+      last_parsed_data = data_.substr(1);
+    }
+
+    ConsumeChar();
+  }
+}
+
+// Consumes a JSON number from data_ into *s: optional leading '-', an
+// integer part ('0' or a nonzero digit followed by digits), an optional
+// fractional part, and an optional exponent.  On success *s holds the exact
+// text of the number; on failure data_ is restored.
+bool Tokenizer::ConsumeNumber(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // number. Any AtEnd() checks should confirm that.
+  *s = ::std::string();
+  const absl::string_view original = data_;
+
+  // Consume the leading - unconditionally.
+  Consume("-");
+
+  // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
+  // by a second number.
+  if (!Consume("0")) {
+    if (AtEnd()) {
+      return false;
+    } else if (Char() >= '1' && Char() <= '9') {
+      // This wasn't a zero, but was a valid digit. Consume it.
+      ConsumeChar();
+    } else {
+      return false;
+    }
+
+    // Now consume any number of any digits.
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+    }
+  }
+
+  // We could now have a decimal.
+  // NOTE(review): Char() is read here (and at the exponent check below)
+  // without a preceding AtEnd() guard, unlike every other read in this
+  // function; if the number is the very last thing in data_ this reads past
+  // the end -- confirm the input always has a trailing delimiter.
+  if (Char() == '.') {
+    ConsumeChar();
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      // And any number of digits.
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+    }
+  }
+
+  // And now an exponent.
+  if (Char() == 'e' || Char() == 'E') {
+    ConsumeChar();
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // Which could have a +-
+    if (Char() == '+' || Char() == '-') {
+      ConsumeChar();
+    }
+    int count = 0;
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      // And digits.
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+      ++count;
+    }
+    // But, it is an error to have an exponent and nothing following it.
+    if (count == 0) {
+      data_ = original;
+      return false;
+    }
+  }
+
+  // The number's text is everything consumed since we started.
+  *s = ::std::string(original.substr(0, original.size() - data_.size()));
+  return true;
+}
+
+// Advances the tokenizer state machine by one step and returns the next
+// token.  The token's payload, if any, is left in field_name_ (for kField)
+// or field_value_ (for the value tokens) for the caller to read.  Returns
+// kError on any syntax error, after printing a diagnostic to stderr.
+Tokenizer::TokenType Tokenizer::Next() {
+  switch (state_) {
+    case State::kExpectObjectStart:
+      // We should always start out with a {
+      if (!Consume("{")) return TokenType::kError;
+
+      // Document that we just started an object.
+      object_type_.push_back(ObjectType::kObject);
+
+      ConsumeWhitespace();
+
+      // An immediately-closed object ("{}") skips straight to the end state.
+      if (Consume("}")) {
+        ConsumeWhitespace();
+        state_ = State::kExpectObjectEnd;
+      } else {
+        state_ = State::kExpectField;
+      }
+      return TokenType::kStartObject;
+
+    case State::kExpectField: {
+      // Fields are built up of strings, whitespace, and then a : (followed by
+      // whitespace...)
+      ::std::string s;
+      if (!ConsumeString(&s)) {
+        fprintf(stderr, "Error on line %d, expected string for field name.\n",
+                linenumber_);
+        return TokenType::kError;
+      }
+      field_name_ = ::std::move(s);
+
+      ConsumeWhitespace();
+
+      if (!Consume(":")) {
+        fprintf(stderr, "Error on line %d\n", linenumber_);
+        return TokenType::kError;
+      }
+
+      ConsumeWhitespace();
+
+      state_ = State::kExpectValue;
+
+      return TokenType::kField;
+    } break;
+    case State::kExpectValue: {
+      TokenType result = TokenType::kError;
+
+      ::std::string s;
+      if (Consume("{")) {
+        // Fields are in objects. Record and recurse.
+        object_type_.push_back(ObjectType::kObject);
+
+        ConsumeWhitespace();
+
+        state_ = State::kExpectField;
+        return TokenType::kStartObject;
+      } else if (Consume("[")) {
+        // Values are in arrays. Record and recurse.
+        object_type_.push_back(ObjectType::kArray);
+
+        ConsumeWhitespace();
+        state_ = State::kExpectValue;
+        return TokenType::kStartArray;
+      } else if (ConsumeString(&s)) {
+        // Parsed as a string, grab it.
+        field_value_ = ::std::move(s);
+        result = TokenType::kStringValue;
+      } else if (ConsumeNumber(&s)) {
+        // Parsed as a number, grab it.
+        field_value_ = ::std::move(s);
+        result = TokenType::kNumberValue;
+      } else if (Consume("true")) {
+        // Parsed as a true, grab it.
+        field_value_ = "true";
+        result = TokenType::kTrueValue;
+      } else if (Consume("false")) {
+        // Parsed as a false, grab it.
+        field_value_ = "false";
+        result = TokenType::kFalseValue;
+      } else {
+        // Couldn't parse, so we have a syntax error.
+        fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
+      }
+
+      ConsumeWhitespace();
+
+      // After a field, we either have a , and another field (or value if we are
+      // in an array), or we should be closing out the object (or array).
+      if (Consume(",")) {
+        ConsumeWhitespace();
+        switch (object_type_.back()) {
+          case ObjectType::kObject:
+            state_ = State::kExpectField;
+            break;
+          case ObjectType::kArray:
+            state_ = State::kExpectValue;
+            break;
+        }
+      } else {
+        // Sanity check that the stack is deep enough.
+        if (object_type_.size() == 0) {
+          fprintf(stderr, "Error on line %d\n", linenumber_);
+          return TokenType::kError;
+        }
+
+        // And then require closing out the object.
+        switch (object_type_.back()) {
+          case ObjectType::kObject:
+            if (Consume("}")) {
+              ConsumeWhitespace();
+              state_ = State::kExpectObjectEnd;
+            } else {
+              return TokenType::kError;
+            }
+            break;
+          case ObjectType::kArray:
+            if (Consume("]")) {
+              ConsumeWhitespace();
+              state_ = State::kExpectArrayEnd;
+            } else {
+              return TokenType::kError;
+            }
+            break;
+        }
+      }
+      return result;
+    } break;
+
+    case State::kExpectArrayEnd:
+    case State::kExpectObjectEnd: {
+      const TokenType result = state_ == State::kExpectArrayEnd
+                                   ? TokenType::kEndArray
+                                   : TokenType::kEndObject;
+      // This is a transient state so we can send 2 tokens out in a row. We
+      // discover the object or array end at the end of reading the value.
+      object_type_.pop_back();
+      if (object_type_.size() == 0) {
+        // We unwound the outer object. We should send kEnd next.
+        state_ = State::kExpectEnd;
+      } else if (object_type_.back() == ObjectType::kObject) {
+        // If we are going into an object, it should either have another field
+        // or end.
+        if (Consume(",")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectField;
+        } else if (Consume("}")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectObjectEnd;
+        } else {
+          return TokenType::kError;
+        }
+      } else if (object_type_.back() == ObjectType::kArray) {
+        // If we are going into an array, it should either have another value
+        // or end.
+        if (Consume(",")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectValue;
+        } else if (Consume("]")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectArrayEnd;
+        } else {
+          return TokenType::kError;
+        }
+      }
+      // And then send out the correct token.
+      return result;
+    }
+    case State::kExpectEnd:
+      // If we are supposed to be done, confirm nothing is after the end.
+      if (AtEnd()) {
+        return TokenType::kEnd;
+      } else {
+        fprintf(stderr, "Data past end at line %d\n", linenumber_);
+        return TokenType::kError;
+      }
+  }
+  // Unreachable for a valid state_, but keeps the compiler happy.
+  return TokenType::kError;
+}
+
+// Parses field_value() as a base-10 long long into *value.  Returns false if
+// the text is not consumed in its entirety by strtoll or if strtoll reports
+// an error via errno (e.g. overflow).
+bool Tokenizer::FieldAsInt(long long *value) {
+  const char *pos = field_value().c_str();
+  // Clear errno first so a stale value can't be mistaken for a strtoll error.
+  errno = 0;
+  *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
+  // Success requires the whole string to have been consumed and no errno.
+  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+    return false;
+  }
+  return true;
+}
+
+// Parses field_value() as a double into *value.  Returns false if the text
+// is not consumed in its entirety by strtod or if strtod reports an error
+// via errno (e.g. overflow/underflow).
+bool Tokenizer::FieldAsDouble(double *value) {
+  const char *pos = field_value().c_str();
+  // Clear errno first so a stale value can't be mistaken for a strtod error.
+  errno = 0;
+  *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
+
+  // Success requires the whole string to have been consumed and no errno.
+  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+    return false;
+  }
+  return true;
+}
+
+} // namespace aos