Factor tokenizer out to a separate file.
json_to_flatbuffer.{cc,h} was getting too big.
Change-Id: I38c10001b15da7fa2c0bfef66be5e640d1fa446f
diff --git a/aos/BUILD b/aos/BUILD
index 0631e66..95418ea 100644
--- a/aos/BUILD
+++ b/aos/BUILD
@@ -426,11 +426,21 @@
)
cc_library(
+ name = "json_tokenizer",
+ srcs = ["json_tokenizer.cc"],
+ hdrs = ["json_tokenizer.h"],
+ deps = [
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_library(
name = "json_to_flatbuffer",
srcs = ["json_to_flatbuffer.cc"],
hdrs = ["json_to_flatbuffer.h"],
deps = [
":flatbuffer_utils",
+ ":json_tokenizer",
"//aos/logging",
"@com_github_google_flatbuffers//:flatbuffers",
"@com_google_absl//absl/strings",
diff --git a/aos/json_to_flatbuffer.cc b/aos/json_to_flatbuffer.cc
index 68d3b03..85e2d7f 100644
--- a/aos/json_to_flatbuffer.cc
+++ b/aos/json_to_flatbuffer.cc
@@ -6,6 +6,7 @@
#include "absl/strings/string_view.h"
#include "aos/flatbuffer_utils.h"
#include "aos/logging/logging.h"
+#include "aos/json_tokenizer.h"
#include "flatbuffers/flatbuffers.h"
#include "flatbuffers/minireflect.h"
@@ -729,408 +730,4 @@
return tostring_visitor.s;
}
-void Tokenizer::ConsumeWhitespace() {
- while (true) {
- if (AtEnd()) {
- return;
- }
- // Skip any whitespace.
- if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
- ConsumeChar();
- } else if (Char() == '\n') {
- ConsumeChar();
- ++linenumber_;
- } else {
- // There is no fail. Once we are out of whitespace (including 0 of it),
- // declare success.
- return;
- }
- }
-}
-
-bool Tokenizer::Consume(const char *token) {
- const absl::string_view original = data_;
- while (true) {
- // Finishing the token is success.
- if (*token == '\0') {
- return true;
- }
-
- // But finishing the data first is failure.
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // Missmatch is failure.
- if (*token != Char()) {
- data_ = original;
- return false;
- }
-
- ConsumeChar();
- ++token;
- }
-}
-
-bool Tokenizer::ConsumeString(::std::string *s) {
- // Under no conditions is it acceptible to run out of data while parsing a
- // string. Any AtEnd checks should confirm that.
- const absl::string_view original = data_;
- if (AtEnd()) {
- return false;
- }
-
- // Expect the leading "
- if (Char() != '"') {
- return false;
- }
-
- ConsumeChar();
- absl::string_view last_parsed_data = data_;
- *s = ::std::string();
-
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // If we get an end or an escape, do something special.
- if (Char() == '"' || Char() == '\\') {
- // Save what we found up until now, not including this character.
- *s += ::std::string(
- last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
-
- // Update the pointer.
- last_parsed_data = data_;
-
- // " is the end, declare victory.
- if (Char() == '"') {
- ConsumeChar();
- return true;
- } else {
- ConsumeChar();
- // Now consume valid escape characters and add their representation onto
- // the output string.
- if (AtEnd()) {
- data_ = original;
- return false;
- } else if (Char() == '"') {
- *s += "\"";
- } else if (Char() == '\\') {
- *s += "\\";
- } else if (Char() == '/') {
- *s += "/";
- } else if (Char() == 'b') {
- *s += "\b";
- } else if (Char() == 'f') {
- *s += "\f";
- } else if (Char() == 'n') {
- *s += "\n";
- } else if (Char() == 'r') {
- *s += "\r";
- } else if (Char() == 't') {
- *s += "\t";
- } else if (Char() == 'u') {
- // TODO(austin): Unicode should be valid, but I really don't care to
- // do this now...
- fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
- data_ = original;
- return false;
- }
- }
- // And skip the escaped character.
- last_parsed_data = data_.substr(1);
- }
-
- ConsumeChar();
- }
-}
-
-bool Tokenizer::ConsumeNumber(::std::string *s) {
- // Under no conditions is it acceptible to run out of data while parsing a
- // number. Any AtEnd() checks should confirm that.
- *s = ::std::string();
- const absl::string_view original = data_;
-
- // Consume the leading - unconditionally.
- Consume("-");
-
- // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
- // by a second number.
- if (!Consume("0")) {
- if (AtEnd()) {
- return false;
- } else if (Char() >= '1' && Char() <= '9') {
- // This wasn't a zero, but was a valid digit. Consume it.
- ConsumeChar();
- } else {
- return false;
- }
-
- // Now consume any number of any digits.
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- }
- }
-
- // We could now have a decimal.
- if (Char() == '.') {
- ConsumeChar();
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- // And any number of digits.
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- }
- }
-
- // And now an exponent.
- if (Char() == 'e' || Char() == 'E') {
- ConsumeChar();
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // Which could have a +-
- if (Char() == '+' || Char() == '-') {
- ConsumeChar();
- }
- int count = 0;
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- // And digits.
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- ++count;
- }
- // But, it is an error to have an exponent and nothing following it.
- if (count == 0) {
- data_ = original;
- return false;
- }
- }
-
- *s = ::std::string(original.substr(0, original.size() - data_.size()));
- return true;
-}
-
-Tokenizer::TokenType Tokenizer::Next() {
- switch (state_) {
- case State::kExpectObjectStart:
- // We should always start out with a {
- if (!Consume("{")) return TokenType::kError;
-
- // Document that we just started an object.
- object_type_.push_back(ObjectType::kObject);
-
- ConsumeWhitespace();
-
- if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- state_ = State::kExpectField;
- }
- return TokenType::kStartObject;
-
- case State::kExpectField: {
- // Fields are built up of strings, whitespace, and then a : (followed by
- // whitespace...)
- ::std::string s;
- if (!ConsumeString(&s)) {
- fprintf(stderr, "Error on line %d, expected string for field name.\n",
- linenumber_);
- return TokenType::kError;
- }
- field_name_ = ::std::move(s);
-
- ConsumeWhitespace();
-
- if (!Consume(":")) {
- fprintf(stderr, "Error on line %d\n", linenumber_);
- return TokenType::kError;
- }
-
- ConsumeWhitespace();
-
- state_ = State::kExpectValue;
-
- return TokenType::kField;
- } break;
- case State::kExpectValue: {
- TokenType result = TokenType::kError;
-
- ::std::string s;
- if (Consume("{")) {
- // Fields are in objects. Record and recurse.
- object_type_.push_back(ObjectType::kObject);
-
- ConsumeWhitespace();
-
- state_ = State::kExpectField;
- return TokenType::kStartObject;
- } else if (Consume("[")) {
- // Values are in arrays. Record and recurse.
- object_type_.push_back(ObjectType::kArray);
-
- ConsumeWhitespace();
- state_ = State::kExpectValue;
- return TokenType::kStartArray;
- } else if (ConsumeString(&s)) {
- // Parsed as a string, grab it.
- field_value_ = ::std::move(s);
- result = TokenType::kStringValue;
- } else if (ConsumeNumber(&s)) {
- // Parsed as a number, grab it.
- field_value_ = ::std::move(s);
- result = TokenType::kNumberValue;
- } else if (Consume("true")) {
- // Parsed as a true, grab it.
- field_value_ = "true";
- result = TokenType::kTrueValue;
- } else if (Consume("false")) {
- // Parsed as a false, grab it.
- field_value_ = "false";
- result = TokenType::kFalseValue;
- } else {
- // Couldn't parse, so we have a syntax error.
- fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
- }
-
- ConsumeWhitespace();
-
- // After a field, we either have a , and another field (or value if we are
- // in an array), or we should be closing out the object (or array).
- if (Consume(",")) {
- ConsumeWhitespace();
- switch (object_type_.back()) {
- case ObjectType::kObject:
- state_ = State::kExpectField;
- break;
- case ObjectType::kArray:
- state_ = State::kExpectValue;
- break;
- }
- } else {
- // Sanity check that the stack is deep enough.
- if (object_type_.size() == 0) {
- fprintf(stderr, "Error on line %d\n", linenumber_);
- return TokenType::kError;
- }
-
- // And then require closing out the object.
- switch (object_type_.back()) {
- case ObjectType::kObject:
- if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- return TokenType::kError;
- }
- break;
- case ObjectType::kArray:
- if (Consume("]")) {
- ConsumeWhitespace();
- state_ = State::kExpectArrayEnd;
- } else {
- return TokenType::kError;
- }
- break;
- }
- }
- return result;
- } break;
-
- case State::kExpectArrayEnd:
- case State::kExpectObjectEnd: {
- const TokenType result = state_ == State::kExpectArrayEnd
- ? TokenType::kEndArray
- : TokenType::kEndObject;
- // This is a transient state so we can send 2 tokens out in a row. We
- // discover the object or array end at the end of reading the value.
- object_type_.pop_back();
- if (object_type_.size() == 0) {
- // We unwound the outer object. We should send kEnd next.
- state_ = State::kExpectEnd;
- } else if (object_type_.back() == ObjectType::kObject) {
- // If we are going into an object, it should either have another field
- // or end.
- if (Consume(",")) {
- ConsumeWhitespace();
- state_ = State::kExpectField;
- } else if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- return TokenType::kError;
- }
- } else if (object_type_.back() == ObjectType::kArray) {
- // If we are going into an array, it should either have another value
- // or end.
- if (Consume(",")) {
- ConsumeWhitespace();
- state_ = State::kExpectValue;
- } else if (Consume("]")) {
- ConsumeWhitespace();
- state_ = State::kExpectArrayEnd;
- } else {
- return TokenType::kError;
- }
- }
- // And then send out the correct token.
- return result;
- }
- case State::kExpectEnd:
- // If we are supposed to be done, confirm nothing is after the end.
- if (AtEnd()) {
- return TokenType::kEnd;
- } else {
- fprintf(stderr, "Data past end at line %d\n", linenumber_);
- return TokenType::kError;
- }
- }
- return TokenType::kError;
-}
-
-bool Tokenizer::FieldAsInt(long long *value) {
- const char *pos = field_value().c_str();
- errno = 0;
- *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
- if (pos != field_value().c_str() + field_value().size() || errno != 0) {
- return false;
- }
- return true;
-}
-
-bool Tokenizer::FieldAsDouble(double *value) {
- const char *pos = field_value().c_str();
- errno = 0;
- *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
-
- if (pos != field_value().c_str() + field_value().size() || errno != 0) {
- return false;
- }
- return true;
-}
-
} // namespace aos
diff --git a/aos/json_to_flatbuffer.h b/aos/json_to_flatbuffer.h
index 6a12a56..78d0703 100644
--- a/aos/json_to_flatbuffer.h
+++ b/aos/json_to_flatbuffer.h
@@ -20,98 +20,6 @@
const flatbuffers::TypeTable *typetable,
bool multi_line = false);
-// This class implements the state machine at json.org
-class Tokenizer {
- public:
- Tokenizer(const absl::string_view data) : data_(data) {}
-
- enum class TokenType {
- kEnd,
- kError,
- kStartObject,
- kEndObject,
- kStartArray,
- kEndArray,
- kField,
- kNumberValue,
- kStringValue,
- kTrueValue,
- kFalseValue,
- };
-
- // Returns the next token.
- TokenType Next();
-
- // Returns the last field_name and field_value. These are only valid when
- // Next returns them.
- const ::std::string &field_name() const { return field_name_; }
- const ::std::string &field_value() const { return field_value_; }
-
- // Parses the current field value as a long long. Returns false if it failed
- // to parse.
- bool FieldAsInt(long long *value);
- // Parses the current field value as a double. Returns false if it failed
- // to parse.
- bool FieldAsDouble(double *value);
-
- // Returns true if we are at the end of the input.
- bool AtEnd() { return data_.size() == 0; }
-
- const absl::string_view data_left() const { return data_; }
-
- private:
- // Consumes a single character.
- void ConsumeChar() { data_ = data_.substr(1); }
-
- // Returns the current character.
- char Char() const { return data_[0]; }
-
- // Consumes a string out of data_. Populates s with the string. Returns true
- // if a valid string was found, and false otherwise.
- // data_ is updated only on success.
- bool ConsumeString(::std::string *s);
- // Consumes a number out of data_. Populates s with the string containing the
- // number. Returns true if a valid number was found, and false otherwise.
- // data_ is updated only on success.
- bool ConsumeNumber(::std::string *s);
- // Consumes a fixed token out of data_. Returns true if the string was found,
- // and false otherwise.
- // data_ is updated only on success.
- bool Consume(const char* token);
- // Consumes whitespace out of data_. Returns true if the string was found,
- // and false otherwise.
- // data_ is unconditionally updated.
- void ConsumeWhitespace();
-
- // State for the parsing state machine.
- enum class State {
- kExpectField,
- kExpectObjectStart,
- kExpectObjectEnd,
- kExpectArrayEnd,
- kExpectValue,
- kExpectEnd,
- };
-
- State state_ = State::kExpectObjectStart;
-
- // Data pointer.
- absl::string_view data_;
- // Current line number used for printing debug.
- int linenumber_ = 0;
-
- // Stack used to track which object type we were in when we recursed.
- enum class ObjectType {
- kObject,
- kArray,
- };
- ::std::vector<ObjectType> object_type_;
-
- // Last field name.
- ::std::string field_name_;
- // Last field value.
- ::std::string field_value_;
-};
} // namespace aos
#endif // AOS_JSON_TO_FLATBUFFER_H_
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
new file mode 100644
index 0000000..38ff4e3
--- /dev/null
+++ b/aos/json_tokenizer.cc
@@ -0,0 +1,414 @@
+#include "aos/json_tokenizer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+namespace aos {
+
+void Tokenizer::ConsumeWhitespace() {
+ while (true) {
+ if (AtEnd()) {
+ return;
+ }
+ // Skip any whitespace.
+ if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
+ ConsumeChar();
+ } else if (Char() == '\n') {
+ ConsumeChar();
+ ++linenumber_;
+ } else {
+ // There is no fail. Once we are out of whitespace (including 0 of it),
+ // declare success.
+ return;
+ }
+ }
+}
+
+bool Tokenizer::Consume(const char *token) {
+ const absl::string_view original = data_;
+ while (true) {
+ // Finishing the token is success.
+ if (*token == '\0') {
+ return true;
+ }
+
+ // But finishing the data first is failure.
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+
+    // Mismatch is failure.
+ if (*token != Char()) {
+ data_ = original;
+ return false;
+ }
+
+ ConsumeChar();
+ ++token;
+ }
+}
+
+bool Tokenizer::ConsumeString(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+ // string. Any AtEnd checks should confirm that.
+ const absl::string_view original = data_;
+ if (AtEnd()) {
+ return false;
+ }
+
+ // Expect the leading "
+ if (Char() != '"') {
+ return false;
+ }
+
+ ConsumeChar();
+ absl::string_view last_parsed_data = data_;
+ *s = ::std::string();
+
+ while (true) {
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+
+ // If we get an end or an escape, do something special.
+ if (Char() == '"' || Char() == '\\') {
+ // Save what we found up until now, not including this character.
+ *s += ::std::string(
+ last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
+
+ // Update the pointer.
+ last_parsed_data = data_;
+
+ // " is the end, declare victory.
+ if (Char() == '"') {
+ ConsumeChar();
+ return true;
+ } else {
+ ConsumeChar();
+ // Now consume valid escape characters and add their representation onto
+ // the output string.
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ } else if (Char() == '"') {
+ *s += "\"";
+ } else if (Char() == '\\') {
+ *s += "\\";
+ } else if (Char() == '/') {
+ *s += "/";
+ } else if (Char() == 'b') {
+ *s += "\b";
+ } else if (Char() == 'f') {
+ *s += "\f";
+ } else if (Char() == 'n') {
+ *s += "\n";
+ } else if (Char() == 'r') {
+ *s += "\r";
+ } else if (Char() == 't') {
+ *s += "\t";
+ } else if (Char() == 'u') {
+ // TODO(austin): Unicode should be valid, but I really don't care to
+ // do this now...
+ fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
+ data_ = original;
+ return false;
+ }
+ }
+ // And skip the escaped character.
+ last_parsed_data = data_.substr(1);
+ }
+
+ ConsumeChar();
+ }
+}
+
+bool Tokenizer::ConsumeNumber(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+ // number. Any AtEnd() checks should confirm that.
+ *s = ::std::string();
+ const absl::string_view original = data_;
+
+ // Consume the leading - unconditionally.
+ Consume("-");
+
+ // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
+ // by a second number.
+ if (!Consume("0")) {
+ if (AtEnd()) {
+ return false;
+ } else if (Char() >= '1' && Char() <= '9') {
+ // This wasn't a zero, but was a valid digit. Consume it.
+ ConsumeChar();
+ } else {
+ return false;
+ }
+
+ // Now consume any number of any digits.
+ while (true) {
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+ if (Char() < '0' || Char() > '9') {
+ break;
+ }
+ ConsumeChar();
+ }
+ }
+
+ // We could now have a decimal.
+ if (Char() == '.') {
+ ConsumeChar();
+ while (true) {
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+ // And any number of digits.
+ if (Char() < '0' || Char() > '9') {
+ break;
+ }
+ ConsumeChar();
+ }
+ }
+
+ // And now an exponent.
+ if (Char() == 'e' || Char() == 'E') {
+ ConsumeChar();
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+
+ // Which could have a +-
+ if (Char() == '+' || Char() == '-') {
+ ConsumeChar();
+ }
+ int count = 0;
+ while (true) {
+ if (AtEnd()) {
+ data_ = original;
+ return false;
+ }
+ // And digits.
+ if (Char() < '0' || Char() > '9') {
+ break;
+ }
+ ConsumeChar();
+ ++count;
+ }
+ // But, it is an error to have an exponent and nothing following it.
+ if (count == 0) {
+ data_ = original;
+ return false;
+ }
+ }
+
+ *s = ::std::string(original.substr(0, original.size() - data_.size()));
+ return true;
+}
+
+Tokenizer::TokenType Tokenizer::Next() {
+ switch (state_) {
+ case State::kExpectObjectStart:
+ // We should always start out with a {
+ if (!Consume("{")) return TokenType::kError;
+
+ // Document that we just started an object.
+ object_type_.push_back(ObjectType::kObject);
+
+ ConsumeWhitespace();
+
+ if (Consume("}")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectObjectEnd;
+ } else {
+ state_ = State::kExpectField;
+ }
+ return TokenType::kStartObject;
+
+ case State::kExpectField: {
+ // Fields are built up of strings, whitespace, and then a : (followed by
+ // whitespace...)
+ ::std::string s;
+ if (!ConsumeString(&s)) {
+ fprintf(stderr, "Error on line %d, expected string for field name.\n",
+ linenumber_);
+ return TokenType::kError;
+ }
+ field_name_ = ::std::move(s);
+
+ ConsumeWhitespace();
+
+ if (!Consume(":")) {
+ fprintf(stderr, "Error on line %d\n", linenumber_);
+ return TokenType::kError;
+ }
+
+ ConsumeWhitespace();
+
+ state_ = State::kExpectValue;
+
+ return TokenType::kField;
+ } break;
+ case State::kExpectValue: {
+ TokenType result = TokenType::kError;
+
+ ::std::string s;
+ if (Consume("{")) {
+ // Fields are in objects. Record and recurse.
+ object_type_.push_back(ObjectType::kObject);
+
+ ConsumeWhitespace();
+
+ state_ = State::kExpectField;
+ return TokenType::kStartObject;
+ } else if (Consume("[")) {
+ // Values are in arrays. Record and recurse.
+ object_type_.push_back(ObjectType::kArray);
+
+ ConsumeWhitespace();
+ state_ = State::kExpectValue;
+ return TokenType::kStartArray;
+ } else if (ConsumeString(&s)) {
+ // Parsed as a string, grab it.
+ field_value_ = ::std::move(s);
+ result = TokenType::kStringValue;
+ } else if (ConsumeNumber(&s)) {
+ // Parsed as a number, grab it.
+ field_value_ = ::std::move(s);
+ result = TokenType::kNumberValue;
+ } else if (Consume("true")) {
+ // Parsed as a true, grab it.
+ field_value_ = "true";
+ result = TokenType::kTrueValue;
+ } else if (Consume("false")) {
+ // Parsed as a false, grab it.
+ field_value_ = "false";
+ result = TokenType::kFalseValue;
+ } else {
+ // Couldn't parse, so we have a syntax error.
+ fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
+ }
+
+ ConsumeWhitespace();
+
+ // After a field, we either have a , and another field (or value if we are
+ // in an array), or we should be closing out the object (or array).
+ if (Consume(",")) {
+ ConsumeWhitespace();
+ switch (object_type_.back()) {
+ case ObjectType::kObject:
+ state_ = State::kExpectField;
+ break;
+ case ObjectType::kArray:
+ state_ = State::kExpectValue;
+ break;
+ }
+ } else {
+ // Sanity check that the stack is deep enough.
+ if (object_type_.size() == 0) {
+ fprintf(stderr, "Error on line %d\n", linenumber_);
+ return TokenType::kError;
+ }
+
+ // And then require closing out the object.
+ switch (object_type_.back()) {
+ case ObjectType::kObject:
+ if (Consume("}")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectObjectEnd;
+ } else {
+ return TokenType::kError;
+ }
+ break;
+ case ObjectType::kArray:
+ if (Consume("]")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectArrayEnd;
+ } else {
+ return TokenType::kError;
+ }
+ break;
+ }
+ }
+ return result;
+ } break;
+
+ case State::kExpectArrayEnd:
+ case State::kExpectObjectEnd: {
+ const TokenType result = state_ == State::kExpectArrayEnd
+ ? TokenType::kEndArray
+ : TokenType::kEndObject;
+ // This is a transient state so we can send 2 tokens out in a row. We
+ // discover the object or array end at the end of reading the value.
+ object_type_.pop_back();
+ if (object_type_.size() == 0) {
+ // We unwound the outer object. We should send kEnd next.
+ state_ = State::kExpectEnd;
+ } else if (object_type_.back() == ObjectType::kObject) {
+ // If we are going into an object, it should either have another field
+ // or end.
+ if (Consume(",")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectField;
+ } else if (Consume("}")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectObjectEnd;
+ } else {
+ return TokenType::kError;
+ }
+ } else if (object_type_.back() == ObjectType::kArray) {
+ // If we are going into an array, it should either have another value
+ // or end.
+ if (Consume(",")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectValue;
+ } else if (Consume("]")) {
+ ConsumeWhitespace();
+ state_ = State::kExpectArrayEnd;
+ } else {
+ return TokenType::kError;
+ }
+ }
+ // And then send out the correct token.
+ return result;
+ }
+ case State::kExpectEnd:
+ // If we are supposed to be done, confirm nothing is after the end.
+ if (AtEnd()) {
+ return TokenType::kEnd;
+ } else {
+ fprintf(stderr, "Data past end at line %d\n", linenumber_);
+ return TokenType::kError;
+ }
+ }
+ return TokenType::kError;
+}
+
+bool Tokenizer::FieldAsInt(long long *value) {
+ const char *pos = field_value().c_str();
+ errno = 0;
+ *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
+ if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+ return false;
+ }
+ return true;
+}
+
+bool Tokenizer::FieldAsDouble(double *value) {
+ const char *pos = field_value().c_str();
+ errno = 0;
+ *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
+
+ if (pos != field_value().c_str() + field_value().size() || errno != 0) {
+ return false;
+ }
+ return true;
+}
+
+} // namespace aos
diff --git a/aos/json_tokenizer.h b/aos/json_tokenizer.h
new file mode 100644
index 0000000..3058d7d
--- /dev/null
+++ b/aos/json_tokenizer.h
@@ -0,0 +1,109 @@
+#ifndef AOS_JSON_TOKENIZER_H_
+#define AOS_JSON_TOKENIZER_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+
+namespace aos {
+
+// This class implements the state machine at json.org
+//
+// NOTE: C-style /* */ comments are not actually supported yet;
+// ConsumeWhitespace() only skips spaces, tabs, '\r', and '\n'.
+class Tokenizer {
+ public:
+ Tokenizer(const absl::string_view data) : data_(data) {}
+
+ enum class TokenType {
+ kEnd,
+ kError,
+ kStartObject,
+ kEndObject,
+ kStartArray,
+ kEndArray,
+ kField,
+ kNumberValue,
+ kStringValue,
+ kTrueValue,
+ kFalseValue,
+ };
+
+ // Returns the next token.
+ TokenType Next();
+
+ // Returns the last field_name and field_value. These are only valid when
+ // Next returns them.
+ const ::std::string &field_name() const { return field_name_; }
+ const ::std::string &field_value() const { return field_value_; }
+
+ // Parses the current field value as a long long. Returns false if it failed
+ // to parse.
+ bool FieldAsInt(long long *value);
+ // Parses the current field value as a double. Returns false if it failed
+ // to parse.
+ bool FieldAsDouble(double *value);
+
+ // Returns true if we are at the end of the input.
+ bool AtEnd() { return data_.size() == 0; }
+
+ const absl::string_view data_left() const { return data_; }
+
+ private:
+ // Consumes a single character.
+ void ConsumeChar() { data_ = data_.substr(1); }
+
+ // Returns the current character.
+ char Char() const { return data_[0]; }
+
+ // Consumes a string out of data_. Populates s with the string. Returns true
+ // if a valid string was found, and false otherwise.
+ // data_ is updated only on success.
+ bool ConsumeString(::std::string *s);
+ // Consumes a number out of data_. Populates s with the string containing the
+ // number. Returns true if a valid number was found, and false otherwise.
+ // data_ is updated only on success.
+ bool ConsumeNumber(::std::string *s);
+ // Consumes a fixed token out of data_. Returns true if the string was found,
+ // and false otherwise.
+ // data_ is updated only on success.
+ bool Consume(const char *token);
+  // Consumes any amount of whitespace (possibly none) out of data_,
+  // incrementing linenumber_ on newlines.  It cannot fail.
+  // data_ is unconditionally updated.
+ void ConsumeWhitespace();
+
+ // State for the parsing state machine.
+ enum class State {
+ kExpectField,
+ kExpectObjectStart,
+ kExpectObjectEnd,
+ kExpectArrayEnd,
+ kExpectValue,
+ kExpectEnd,
+ };
+
+ State state_ = State::kExpectObjectStart;
+
+ // Data pointer.
+ absl::string_view data_;
+ // Current line number used for printing debug.
+ int linenumber_ = 0;
+
+ // Stack used to track which object type we were in when we recursed.
+ enum class ObjectType {
+ kObject,
+ kArray,
+ };
+ ::std::vector<ObjectType> object_type_;
+
+ // Last field name.
+ ::std::string field_name_;
+ // Last field value.
+ ::std::string field_value_;
+};
+
+} // namespace aos
+
+#endif // AOS_JSON_TOKENIZER_H_