Factor tokenizer out to a separate file.
json_to_flatbuffer.{cc,h} were getting too big.
Change-Id: I38c10001b15da7fa2c0bfef66be5e640d1fa446f
diff --git a/aos/json_to_flatbuffer.cc b/aos/json_to_flatbuffer.cc
index 68d3b03..85e2d7f 100644
--- a/aos/json_to_flatbuffer.cc
+++ b/aos/json_to_flatbuffer.cc
@@ -6,6 +6,7 @@
#include "absl/strings/string_view.h"
#include "aos/flatbuffer_utils.h"
#include "aos/logging/logging.h"
+#include "aos/json_tokenizer.h"
#include "flatbuffers/flatbuffers.h"
#include "flatbuffers/minireflect.h"
@@ -729,408 +730,4 @@
return tostring_visitor.s;
}
-void Tokenizer::ConsumeWhitespace() {
- while (true) {
- if (AtEnd()) {
- return;
- }
- // Skip any whitespace.
- if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
- ConsumeChar();
- } else if (Char() == '\n') {
- ConsumeChar();
- ++linenumber_;
- } else {
- // There is no fail. Once we are out of whitespace (including 0 of it),
- // declare success.
- return;
- }
- }
-}
-
-bool Tokenizer::Consume(const char *token) {
- const absl::string_view original = data_;
- while (true) {
- // Finishing the token is success.
- if (*token == '\0') {
- return true;
- }
-
- // But finishing the data first is failure.
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // Missmatch is failure.
- if (*token != Char()) {
- data_ = original;
- return false;
- }
-
- ConsumeChar();
- ++token;
- }
-}
-
-bool Tokenizer::ConsumeString(::std::string *s) {
- // Under no conditions is it acceptible to run out of data while parsing a
- // string. Any AtEnd checks should confirm that.
- const absl::string_view original = data_;
- if (AtEnd()) {
- return false;
- }
-
- // Expect the leading "
- if (Char() != '"') {
- return false;
- }
-
- ConsumeChar();
- absl::string_view last_parsed_data = data_;
- *s = ::std::string();
-
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // If we get an end or an escape, do something special.
- if (Char() == '"' || Char() == '\\') {
- // Save what we found up until now, not including this character.
- *s += ::std::string(
- last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
-
- // Update the pointer.
- last_parsed_data = data_;
-
- // " is the end, declare victory.
- if (Char() == '"') {
- ConsumeChar();
- return true;
- } else {
- ConsumeChar();
- // Now consume valid escape characters and add their representation onto
- // the output string.
- if (AtEnd()) {
- data_ = original;
- return false;
- } else if (Char() == '"') {
- *s += "\"";
- } else if (Char() == '\\') {
- *s += "\\";
- } else if (Char() == '/') {
- *s += "/";
- } else if (Char() == 'b') {
- *s += "\b";
- } else if (Char() == 'f') {
- *s += "\f";
- } else if (Char() == 'n') {
- *s += "\n";
- } else if (Char() == 'r') {
- *s += "\r";
- } else if (Char() == 't') {
- *s += "\t";
- } else if (Char() == 'u') {
- // TODO(austin): Unicode should be valid, but I really don't care to
- // do this now...
- fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
- data_ = original;
- return false;
- }
- }
- // And skip the escaped character.
- last_parsed_data = data_.substr(1);
- }
-
- ConsumeChar();
- }
-}
-
-bool Tokenizer::ConsumeNumber(::std::string *s) {
- // Under no conditions is it acceptible to run out of data while parsing a
- // number. Any AtEnd() checks should confirm that.
- *s = ::std::string();
- const absl::string_view original = data_;
-
- // Consume the leading - unconditionally.
- Consume("-");
-
- // Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
- // by a second number.
- if (!Consume("0")) {
- if (AtEnd()) {
- return false;
- } else if (Char() >= '1' && Char() <= '9') {
- // This wasn't a zero, but was a valid digit. Consume it.
- ConsumeChar();
- } else {
- return false;
- }
-
- // Now consume any number of any digits.
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- }
- }
-
- // We could now have a decimal.
- if (Char() == '.') {
- ConsumeChar();
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- // And any number of digits.
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- }
- }
-
- // And now an exponent.
- if (Char() == 'e' || Char() == 'E') {
- ConsumeChar();
- if (AtEnd()) {
- data_ = original;
- return false;
- }
-
- // Which could have a +-
- if (Char() == '+' || Char() == '-') {
- ConsumeChar();
- }
- int count = 0;
- while (true) {
- if (AtEnd()) {
- data_ = original;
- return false;
- }
- // And digits.
- if (Char() < '0' || Char() > '9') {
- break;
- }
- ConsumeChar();
- ++count;
- }
- // But, it is an error to have an exponent and nothing following it.
- if (count == 0) {
- data_ = original;
- return false;
- }
- }
-
- *s = ::std::string(original.substr(0, original.size() - data_.size()));
- return true;
-}
-
-Tokenizer::TokenType Tokenizer::Next() {
- switch (state_) {
- case State::kExpectObjectStart:
- // We should always start out with a {
- if (!Consume("{")) return TokenType::kError;
-
- // Document that we just started an object.
- object_type_.push_back(ObjectType::kObject);
-
- ConsumeWhitespace();
-
- if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- state_ = State::kExpectField;
- }
- return TokenType::kStartObject;
-
- case State::kExpectField: {
- // Fields are built up of strings, whitespace, and then a : (followed by
- // whitespace...)
- ::std::string s;
- if (!ConsumeString(&s)) {
- fprintf(stderr, "Error on line %d, expected string for field name.\n",
- linenumber_);
- return TokenType::kError;
- }
- field_name_ = ::std::move(s);
-
- ConsumeWhitespace();
-
- if (!Consume(":")) {
- fprintf(stderr, "Error on line %d\n", linenumber_);
- return TokenType::kError;
- }
-
- ConsumeWhitespace();
-
- state_ = State::kExpectValue;
-
- return TokenType::kField;
- } break;
- case State::kExpectValue: {
- TokenType result = TokenType::kError;
-
- ::std::string s;
- if (Consume("{")) {
- // Fields are in objects. Record and recurse.
- object_type_.push_back(ObjectType::kObject);
-
- ConsumeWhitespace();
-
- state_ = State::kExpectField;
- return TokenType::kStartObject;
- } else if (Consume("[")) {
- // Values are in arrays. Record and recurse.
- object_type_.push_back(ObjectType::kArray);
-
- ConsumeWhitespace();
- state_ = State::kExpectValue;
- return TokenType::kStartArray;
- } else if (ConsumeString(&s)) {
- // Parsed as a string, grab it.
- field_value_ = ::std::move(s);
- result = TokenType::kStringValue;
- } else if (ConsumeNumber(&s)) {
- // Parsed as a number, grab it.
- field_value_ = ::std::move(s);
- result = TokenType::kNumberValue;
- } else if (Consume("true")) {
- // Parsed as a true, grab it.
- field_value_ = "true";
- result = TokenType::kTrueValue;
- } else if (Consume("false")) {
- // Parsed as a false, grab it.
- field_value_ = "false";
- result = TokenType::kFalseValue;
- } else {
- // Couldn't parse, so we have a syntax error.
- fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
- }
-
- ConsumeWhitespace();
-
- // After a field, we either have a , and another field (or value if we are
- // in an array), or we should be closing out the object (or array).
- if (Consume(",")) {
- ConsumeWhitespace();
- switch (object_type_.back()) {
- case ObjectType::kObject:
- state_ = State::kExpectField;
- break;
- case ObjectType::kArray:
- state_ = State::kExpectValue;
- break;
- }
- } else {
- // Sanity check that the stack is deep enough.
- if (object_type_.size() == 0) {
- fprintf(stderr, "Error on line %d\n", linenumber_);
- return TokenType::kError;
- }
-
- // And then require closing out the object.
- switch (object_type_.back()) {
- case ObjectType::kObject:
- if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- return TokenType::kError;
- }
- break;
- case ObjectType::kArray:
- if (Consume("]")) {
- ConsumeWhitespace();
- state_ = State::kExpectArrayEnd;
- } else {
- return TokenType::kError;
- }
- break;
- }
- }
- return result;
- } break;
-
- case State::kExpectArrayEnd:
- case State::kExpectObjectEnd: {
- const TokenType result = state_ == State::kExpectArrayEnd
- ? TokenType::kEndArray
- : TokenType::kEndObject;
- // This is a transient state so we can send 2 tokens out in a row. We
- // discover the object or array end at the end of reading the value.
- object_type_.pop_back();
- if (object_type_.size() == 0) {
- // We unwound the outer object. We should send kEnd next.
- state_ = State::kExpectEnd;
- } else if (object_type_.back() == ObjectType::kObject) {
- // If we are going into an object, it should either have another field
- // or end.
- if (Consume(",")) {
- ConsumeWhitespace();
- state_ = State::kExpectField;
- } else if (Consume("}")) {
- ConsumeWhitespace();
- state_ = State::kExpectObjectEnd;
- } else {
- return TokenType::kError;
- }
- } else if (object_type_.back() == ObjectType::kArray) {
- // If we are going into an array, it should either have another value
- // or end.
- if (Consume(",")) {
- ConsumeWhitespace();
- state_ = State::kExpectValue;
- } else if (Consume("]")) {
- ConsumeWhitespace();
- state_ = State::kExpectArrayEnd;
- } else {
- return TokenType::kError;
- }
- }
- // And then send out the correct token.
- return result;
- }
- case State::kExpectEnd:
- // If we are supposed to be done, confirm nothing is after the end.
- if (AtEnd()) {
- return TokenType::kEnd;
- } else {
- fprintf(stderr, "Data past end at line %d\n", linenumber_);
- return TokenType::kError;
- }
- }
- return TokenType::kError;
-}
-
-bool Tokenizer::FieldAsInt(long long *value) {
- const char *pos = field_value().c_str();
- errno = 0;
- *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
- if (pos != field_value().c_str() + field_value().size() || errno != 0) {
- return false;
- }
- return true;
-}
-
-bool Tokenizer::FieldAsDouble(double *value) {
- const char *pos = field_value().c_str();
- errno = 0;
- *value = strtod(field_value().c_str(), const_cast<char **>(&pos));
-
- if (pos != field_value().c_str() + field_value().size() || errno != 0) {
- return false;
- }
- return true;
-}
-
} // namespace aos