Factor tokenizer out to a separate file.
json_to_flatbuffer.{cc,h} was getting too big.
Change-Id: I38c10001b15da7fa2c0bfef66be5e640d1fa446f
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
new file mode 100644
index 0000000..38ff4e3
--- /dev/null
+++ b/aos/json_tokenizer.cc
@@ -0,0 +1,409 @@
+#include "aos/json_tokenizer.h"
+
+namespace aos {
+
+// Consumes any run of whitespace (space, '\r', '\t', '\n') from the front of
+// data_, counting newlines into linenumber_ for error reporting.  Cannot
+// fail: zero characters of whitespace is also success.
+void Tokenizer::ConsumeWhitespace() {
+  while (true) {
+    if (AtEnd()) {
+      return;
+    }
+    // Skip any whitespace.
+    if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
+      ConsumeChar();
+    } else if (Char() == '\n') {
+      ConsumeChar();
+      // Track line numbers so error messages can point at the right line.
+      ++linenumber_;
+    } else {
+      // There is no fail. Once we are out of whitespace (including 0 of it),
+      // declare success.
+      return;
+    }
+  }
+}
+
+// Attempts to consume the literal string |token| from the front of data_.
+// Returns true and advances data_ past the token on success; on any failure
+// (mismatch or running out of data) data_ is restored to where it started
+// and false is returned.
+bool Tokenizer::Consume(const char *token) {
+  // Remember where we started so failure can rewind.
+  const absl::string_view original = data_;
+  while (true) {
+    // Finishing the token is success.
+    if (*token == '\0') {
+      return true;
+    }
+
+    // But finishing the data first is failure.
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // Mismatch is failure.
+    if (*token != Char()) {
+      data_ = original;
+      return false;
+    }
+
+    ConsumeChar();
+    ++token;
+  }
+}
+
+// Consumes a double-quoted JSON string from data_, un-escaping it into *s.
+// On success, data_ is advanced past the closing quote and true is returned;
+// on failure data_ is restored and the contents of *s are unspecified.
+bool Tokenizer::ConsumeString(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // string. Any AtEnd checks should confirm that.
+  const absl::string_view original = data_;
+  if (AtEnd()) {
+    return false;
+  }
+
+  // Expect the leading "
+  if (Char() != '"') {
+    return false;
+  }
+
+  ConsumeChar();
+  // last_parsed_data marks the start of the current run of plain
+  // (non-escaped) characters, so they can be appended to *s in one chunk.
+  absl::string_view last_parsed_data = data_;
+  *s = ::std::string();
+
+  while (true) {
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // If we get an end or an escape, do something special.
+    if (Char() == '"' || Char() == '\\') {
+      // Save what we found up until now, not including this character.
+      *s += ::std::string(
+          last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
+
+      // Update the pointer.
+      last_parsed_data = data_;
+
+      // " is the end, declare victory.
+      if (Char() == '"') {
+        ConsumeChar();
+        return true;
+      } else {
+        ConsumeChar();
+        // Now consume valid escape characters and add their representation
+        // onto the output string.
+        if (AtEnd()) {
+          data_ = original;
+          return false;
+        } else if (Char() == '"') {
+          *s += "\"";
+        } else if (Char() == '\\') {
+          *s += "\\";
+        } else if (Char() == '/') {
+          *s += "/";
+        } else if (Char() == 'b') {
+          *s += "\b";
+        } else if (Char() == 'f') {
+          *s += "\f";
+        } else if (Char() == 'n') {
+          *s += "\n";
+        } else if (Char() == 'r') {
+          *s += "\r";
+        } else if (Char() == 't') {
+          *s += "\t";
+        } else if (Char() == 'u') {
+          // TODO(austin): Unicode should be valid, but I really don't care to
+          // do this now...
+          fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
+          data_ = original;
+          return false;
+        }
+        // NOTE(review): a backslash followed by any character not listed
+        // above falls through here without appending anything, so an
+        // unrecognized escape is silently dropped rather than rejected --
+        // confirm that is intended.
+      }
+      // And skip the escaped character.
+      last_parsed_data = data_.substr(1);
+    }
+
+    ConsumeChar();
+  }
+}
+
+// Consumes a JSON number from data_ into *s: optional leading '-', an
+// integer part ('0' or a nonzero digit followed by digits), an optional
+// fractional part, and an optional exponent.  On success *s holds the exact
+// text of the number; on failure data_ is restored.
+bool Tokenizer::ConsumeNumber(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // number. Any AtEnd() checks should confirm that.
+  *s = ::std::string();
+  const absl::string_view original = data_;
+
+  // Consume the leading - unconditionally.
+  Consume("-");
+
+  // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
+  // by a second number.
+  if (!Consume("0")) {
+    if (AtEnd()) {
+      return false;
+    } else if (Char() >= '1' && Char() <= '9') {
+      // This wasn't a zero, but was a valid digit. Consume it.
+      ConsumeChar();
+    } else {
+      return false;
+    }
+
+    // Now consume any number of any digits.
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+    }
+  }
+
+  // We could now have a decimal.
+  // NOTE(review): Char() is read here (and at the exponent check below)
+  // without a preceding AtEnd() guard, unlike every other read in this
+  // function; if the number is the very last thing in data_ this reads past
+  // the end -- confirm the input always has a trailing delimiter.
+  if (Char() == '.') {
+    ConsumeChar();
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      // And any number of digits.
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+    }
+  }
+
+  // And now an exponent.
+  if (Char() == 'e' || Char() == 'E') {
+    ConsumeChar();
+    if (AtEnd()) {
+      data_ = original;
+      return false;
+    }
+
+    // Which could have a +-
+    if (Char() == '+' || Char() == '-') {
+      ConsumeChar();
+    }
+    int count = 0;
+    while (true) {
+      if (AtEnd()) {
+        data_ = original;
+        return false;
+      }
+      // And digits.
+      if (Char() < '0' || Char() > '9') {
+        break;
+      }
+      ConsumeChar();
+      ++count;
+    }
+    // But, it is an error to have an exponent and nothing following it.
+    if (count == 0) {
+      data_ = original;
+      return false;
+    }
+  }
+
+  // The number's text is everything consumed since we started.
+  *s = ::std::string(original.substr(0, original.size() - data_.size()));
+  return true;
+}
+
+// Advances the tokenizer state machine by one step and returns the next
+// token.  The token's payload, if any, is left in field_name_ (for kField)
+// or field_value_ (for the value tokens) for the caller to read.  Returns
+// kError on any syntax error, after printing a diagnostic to stderr.
+Tokenizer::TokenType Tokenizer::Next() {
+  switch (state_) {
+    case State::kExpectObjectStart:
+      // We should always start out with a {
+      if (!Consume("{")) return TokenType::kError;
+
+      // Document that we just started an object.
+      object_type_.push_back(ObjectType::kObject);
+
+      ConsumeWhitespace();
+
+      // An immediately-closed object ("{}") skips straight to the end state.
+      if (Consume("}")) {
+        ConsumeWhitespace();
+        state_ = State::kExpectObjectEnd;
+      } else {
+        state_ = State::kExpectField;
+      }
+      return TokenType::kStartObject;
+
+    case State::kExpectField: {
+      // Fields are built up of strings, whitespace, and then a : (followed by
+      // whitespace...)
+      ::std::string s;
+      if (!ConsumeString(&s)) {
+        fprintf(stderr, "Error on line %d, expected string for field name.\n",
+                linenumber_);
+        return TokenType::kError;
+      }
+      field_name_ = ::std::move(s);
+
+      ConsumeWhitespace();
+
+      if (!Consume(":")) {
+        fprintf(stderr, "Error on line %d\n", linenumber_);
+        return TokenType::kError;
+      }
+
+      ConsumeWhitespace();
+
+      state_ = State::kExpectValue;
+
+      return TokenType::kField;
+    } break;
+    case State::kExpectValue: {
+      TokenType result = TokenType::kError;
+
+      ::std::string s;
+      if (Consume("{")) {
+        // Fields are in objects. Record and recurse.
+        object_type_.push_back(ObjectType::kObject);
+
+        ConsumeWhitespace();
+
+        state_ = State::kExpectField;
+        return TokenType::kStartObject;
+      } else if (Consume("[")) {
+        // Values are in arrays. Record and recurse.
+        object_type_.push_back(ObjectType::kArray);
+
+        ConsumeWhitespace();
+        state_ = State::kExpectValue;
+        return TokenType::kStartArray;
+      } else if (ConsumeString(&s)) {
+        // Parsed as a string, grab it.
+        field_value_ = ::std::move(s);
+        result = TokenType::kStringValue;
+      } else if (ConsumeNumber(&s)) {
+        // Parsed as a number, grab it.
+        field_value_ = ::std::move(s);
+        result = TokenType::kNumberValue;
+      } else if (Consume("true")) {
+        // Parsed as a true, grab it.
+        field_value_ = "true";
+        result = TokenType::kTrueValue;
+      } else if (Consume("false")) {
+        // Parsed as a false, grab it.
+        field_value_ = "false";
+        result = TokenType::kFalseValue;
+      } else {
+        // Couldn't parse, so we have a syntax error.
+        fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
+      }
+
+      ConsumeWhitespace();
+
+      // After a field, we either have a , and another field (or value if we are
+      // in an array), or we should be closing out the object (or array).
+      if (Consume(",")) {
+        ConsumeWhitespace();
+        switch (object_type_.back()) {
+          case ObjectType::kObject:
+            state_ = State::kExpectField;
+            break;
+          case ObjectType::kArray:
+            state_ = State::kExpectValue;
+            break;
+        }
+      } else {
+        // Sanity check that the stack is deep enough.
+        if (object_type_.size() == 0) {
+          fprintf(stderr, "Error on line %d\n", linenumber_);
+          return TokenType::kError;
+        }
+
+        // And then require closing out the object.
+        switch (object_type_.back()) {
+          case ObjectType::kObject:
+            if (Consume("}")) {
+              ConsumeWhitespace();
+              state_ = State::kExpectObjectEnd;
+            } else {
+              return TokenType::kError;
+            }
+            break;
+          case ObjectType::kArray:
+            if (Consume("]")) {
+              ConsumeWhitespace();
+              state_ = State::kExpectArrayEnd;
+            } else {
+              return TokenType::kError;
+            }
+            break;
+        }
+      }
+      return result;
+    } break;
+
+    case State::kExpectArrayEnd:
+    case State::kExpectObjectEnd: {
+      const TokenType result = state_ == State::kExpectArrayEnd
+                                   ? TokenType::kEndArray
+                                   : TokenType::kEndObject;
+      // This is a transient state so we can send 2 tokens out in a row. We
+      // discover the object or array end at the end of reading the value.
+      object_type_.pop_back();
+      if (object_type_.size() == 0) {
+        // We unwound the outer object. We should send kEnd next.
+        state_ = State::kExpectEnd;
+      } else if (object_type_.back() == ObjectType::kObject) {
+        // If we are going into an object, it should either have another field
+        // or end.
+        if (Consume(",")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectField;
+        } else if (Consume("}")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectObjectEnd;
+        } else {
+          return TokenType::kError;
+        }
+      } else if (object_type_.back() == ObjectType::kArray) {
+        // If we are going into an array, it should either have another value
+        // or end.
+        if (Consume(",")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectValue;
+        } else if (Consume("]")) {
+          ConsumeWhitespace();
+          state_ = State::kExpectArrayEnd;
+        } else {
+          return TokenType::kError;
+        }
+      }
+      // And then send out the correct token.
+      return result;
+    }
+    case State::kExpectEnd:
+      // If we are supposed to be done, confirm nothing is after the end.
+      if (AtEnd()) {
+        return TokenType::kEnd;
+      } else {
+        fprintf(stderr, "Data past end at line %d\n", linenumber_);
+        return TokenType::kError;
+      }
+  }
+  // Unreachable for a valid state_, but keeps the compiler happy.
+  return TokenType::kError;
+}
+
+// Parses field_value() as a base-10 long long into *value.  Returns false if
+// the text is not consumed in its entirety by strtoll or if strtoll reports
+// an error via errno (e.g. overflow).
+bool Tokenizer::FieldAsInt(long long *value) {
+  const char *pos = field_value().c_str();
+  // Clear errno first so a stale value can't be mistaken for a strtoll error.
+  errno = 0;
+  *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
+  // Success requires the whole string to have been consumed and no errno.
+  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+    return false;
+  }
+  return true;
+}
+
+// Parses field_value() as a double into *value.  Returns false if the text
+// is not consumed in its entirety by strtod or if strtod reports an error
+// via errno (e.g. overflow/underflow).
+bool Tokenizer::FieldAsDouble(double *value) {
+  const char *pos = field_value().c_str();
+  // Clear errno first so a stale value can't be mistaken for a strtod error.
+  errno = 0;
+  *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
+
+  // Success requires the whole string to have been consumed and no errno.
+  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+    return false;
+  }
+  return true;
+}
+
+} // namespace aos