#include "aos/json_tokenizer.h"

#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <limits>

namespace aos {

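// Consumes whitespace: spaces, tabs, carriage returns, newlines (bumping
// linenumber_ as it goes), and /* ... */ comments. For example, with data_
// starting as "  /* note */\n  {", this leaves data_ pointing at "{".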
void Tokenizer::ConsumeWhitespace() {
  while (true) {
    if (AtEnd()) {
      return;
    }
    // Skip any whitespace.
    if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
      ConsumeChar();
    } else if (Char() == '\n') {
      ConsumeChar();
      ++linenumber_;
    } else if (Consume("/*")) {
      while (!Consume("*/")) {
        // Running off the end of the data means the comment was never closed.
        // Stop here instead of reading past the end.
        if (AtEnd()) {
          fprintf(stderr, "Unclosed /* comment\n");
          return;
        }
        if (Char() == '\n') {
          ++linenumber_;
        }
        ConsumeChar();
      }
    } else {
      // There is no failure case. Once we are out of whitespace (even if
      // there was none), declare success.
      return;
    }
  }
}

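// Consumes the literal `token` if it is the next thing in data_. On a
// mismatch, or if the data runs out first, data_ is restored and false is
// returned. For example, Consume("nan") on data_ == "nan]" succeeds and
// leaves data_ == "]"; on data_ == "null]" it fails and data_ is unchanged.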
bool Tokenizer::Consume(const char *token) {
  const std::string_view original = data_;
  while (true) {
    // Finishing the token is success.
    if (*token == '\0') {
      return true;
    }

    // But running out of data first is failure.
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // Mismatch is failure.
    if (*token != Char()) {
      data_ = original;
      return false;
    }

    ConsumeChar();
    ++token;
  }
}

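// Consumes a double-quoted JSON string into *s, translating the escape
// sequences handled below (\" \\ \/ \b \f \n \r \t). For example, if data_
// starts with "a\tb" (quotes included), *s ends up as 'a', a tab, 'b', and
// data_ points just past the closing quote. On failure data_ is restored.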
bool Tokenizer::ConsumeString(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // string. Any AtEnd checks should confirm that.
  const std::string_view original = data_;
  if (AtEnd()) {
    return false;
  }

  // Expect the leading "
  if (Char() != '"') {
    return false;
  }

  ConsumeChar();
  std::string_view last_parsed_data = data_;
  *s = ::std::string();

  while (true) {
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // If we get an end or an escape, do something special.
    if (Char() == '"' || Char() == '\\') {
      // Save what we found up until now, not including this character.
      *s += ::std::string(
          last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));

      // Update the pointer.
      last_parsed_data = data_;

      // " is the end, declare victory.
      if (Char() == '"') {
        ConsumeChar();
        return true;
      } else {
        ConsumeChar();
        // Now consume valid escape characters and add their representation
        // onto the output string.
        if (AtEnd()) {
          data_ = original;
          return false;
        } else if (Char() == '"') {
          *s += "\"";
        } else if (Char() == '\\') {
          *s += "\\";
        } else if (Char() == '/') {
          *s += "/";
        } else if (Char() == 'b') {
          *s += "\b";
        } else if (Char() == 'f') {
          *s += "\f";
        } else if (Char() == 'n') {
          *s += "\n";
        } else if (Char() == 'r') {
          *s += "\r";
        } else if (Char() == 't') {
          *s += "\t";
        } else if (Char() == 'u') {
          // TODO(austin): Unicode should be valid, but I really don't care to
          // do this now...
          fprintf(stderr, "Unsupported unicode escape on line %d\n",
                  linenumber_);
          data_ = original;
          return false;
        }
      }
      // And skip the escaped character.
      last_parsed_data = data_.substr(1);
    }

    ConsumeChar();
  }
}

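// Consumes a JSON number as raw text into *s: an optional leading '-', an
// integer part, then an optional fraction and exponent, e.g. "-0", "12.5", or
// "3e-2". "nan" is also accepted because flatbuffers prints it. On failure
// data_ is restored and false is returned.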
bool Tokenizer::ConsumeNumber(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // number. Any AtEnd() checks should confirm that.
  *s = ::std::string();
  const std::string_view original = data_;

  // Consume the leading '-' if there is one.
  Consume("-");

  // See if we find nan. This isn't standards-compliant, but it is what
  // flatbuffers prints out, so we need to parse it.
  if (Consume("nan")) {
    *s = ::std::string(original.substr(0, original.size() - data_.size()));
    return true;
  }

  // Next is either a 0 or a nonzero digit. Only a nonzero digit may be
  // followed by more digits.
  if (!Consume("0")) {
    if (AtEnd()) {
      data_ = original;
      return false;
    } else if (Char() >= '1' && Char() <= '9') {
      // This wasn't a zero, but was a valid digit. Consume it.
      ConsumeChar();
    } else {
      data_ = original;
      return false;
    }

    // Now consume any number of any digits.
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
    }
  }

  // We could now have a decimal. Guard the peek so a number that ends the
  // data doesn't read past the end.
  if (!AtEnd() && Char() == '.') {
    ConsumeChar();
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      // And any number of digits.
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
    }
  }

  // And now an exponent.
  if (!AtEnd() && (Char() == 'e' || Char() == 'E')) {
    ConsumeChar();
    if (AtEnd()) {
      data_ = original;
      return false;
    }

    // Which could have a +-
    if (Char() == '+' || Char() == '-') {
      ConsumeChar();
    }
    int count = 0;
    while (true) {
      if (AtEnd()) {
        data_ = original;
        return false;
      }
      // And digits.
      if (Char() < '0' || Char() > '9') {
        break;
      }
      ConsumeChar();
      ++count;
    }
    // But, it is an error to have an exponent and nothing following it.
    if (count == 0) {
      data_ = original;
      return false;
    }
  }

  *s = ::std::string(original.substr(0, original.size() - data_.size()));
  return true;
}

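// Advances the state machine one token. For the input
//   {"x": 1, "y": [true]}
// successive calls return kStartObject, kField ("x"), kNumberValue ("1"),
// kField ("y"), kStartArray, kTrueValue, kEndArray, kEndObject, and finally
// kEnd.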
Tokenizer::TokenType Tokenizer::Next() {
  switch (state_) {
    case State::kExpectObjectStart:
      // We should always start out with a {.
      if (!Consume("{")) {
        fprintf(stderr, "Error on line %d, expected { to start the object\n",
                linenumber_);
        return TokenType::kError;
      }

      // Document that we just started an object.
      object_type_.push_back(ObjectType::kObject);

      ConsumeWhitespace();

      if (Consume("}")) {
        ConsumeWhitespace();
        state_ = State::kExpectObjectEnd;
      } else {
        state_ = State::kExpectField;
      }
      return TokenType::kStartObject;

    case State::kExpectField: {
      // A field is a string, optional whitespace, then a ':' (followed by
      // more whitespace...).
      ::std::string s;
      if (!ConsumeString(&s)) {
        fprintf(stderr, "Error on line %d, expected string for field name.\n",
                linenumber_);
        if (Consume("}")) {
          fprintf(stderr,
                  "Got '}' instead. Did you add an extra trailing ','?\n");
        }
        return TokenType::kError;
      }
      field_name_ = ::std::move(s);

      ConsumeWhitespace();

      if (!Consume(":")) {
        fprintf(stderr, "Error on line %d, expected : after field name\n",
                linenumber_);
        return TokenType::kError;
      }

      ConsumeWhitespace();

      state_ = State::kExpectValue;

      return TokenType::kField;
    } break;
    case State::kExpectValue: {
      TokenType result = TokenType::kError;

      ::std::string s;
      if (Consume("{")) {
        // A '{' starts a nested object. Record it and recurse.
        object_type_.push_back(ObjectType::kObject);

        ConsumeWhitespace();

        // And then if we encounter the end again, go to the end state.
        if (Consume("}")) {
          ConsumeWhitespace();
          state_ = State::kExpectObjectEnd;
        } else {
          state_ = State::kExpectField;
        }
        return TokenType::kStartObject;
      } else if (Consume("[")) {
        // A '[' starts a nested array. Record it and recurse.
        object_type_.push_back(ObjectType::kArray);

        ConsumeWhitespace();
        state_ = State::kExpectValue;
        return TokenType::kStartArray;
      } else if (ConsumeString(&s)) {
        // Parsed as a string, grab it.
        field_value_ = ::std::move(s);
        result = TokenType::kStringValue;
      } else if (ConsumeNumber(&s)) {
        // Parsed as a number, grab it.
        field_value_ = ::std::move(s);
        result = TokenType::kNumberValue;
      } else if (Consume("true")) {
        // Parsed a true literal, grab it.
        field_value_ = "true";
        result = TokenType::kTrueValue;
      } else if (Consume("false")) {
        // Parsed a false literal, grab it.
        field_value_ = "false";
        result = TokenType::kFalseValue;
      } else {
        switch (object_type_.back()) {
          case ObjectType::kObject:
            if (Consume("}")) {
              ConsumeWhitespace();
              state_ = State::kExpectObjectEnd;
              return Next();
            }
            break;
          case ObjectType::kArray:
            if (Consume("]")) {
              ConsumeWhitespace();
              state_ = State::kExpectArrayEnd;
              return Next();
            }
            break;
        }
        // Couldn't parse, so we have a syntax error.
        fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
      }

      ConsumeWhitespace();

      // After a value, we either have a ',' and another field (or value if we
      // are in an array), or we should be closing out the object (or array).
      if (Consume(",")) {
        ConsumeWhitespace();
        switch (object_type_.back()) {
          case ObjectType::kObject:
            state_ = State::kExpectField;
            break;
          case ObjectType::kArray:
            state_ = State::kExpectValue;
            break;
        }
      } else {
        // Sanity check that the stack is deep enough.
        if (object_type_.size() == 0) {
          fprintf(stderr, "Error on line %d\n", linenumber_);
          return TokenType::kError;
        }

        // And then require closing out the object.
        switch (object_type_.back()) {
          case ObjectType::kObject:
            if (Consume("}")) {
              ConsumeWhitespace();
              state_ = State::kExpectObjectEnd;
            } else {
              fprintf(stderr, "Error on line %d, expected } or ,\n",
                      linenumber_);
              return TokenType::kError;
            }
            break;
          case ObjectType::kArray:
            if (Consume("]")) {
              ConsumeWhitespace();
              state_ = State::kExpectArrayEnd;
            } else {
              fprintf(stderr, "Error on line %d, expected ] or ,\n",
                      linenumber_);
              return TokenType::kError;
            }
            break;
        }
      }
      return result;
    } break;

    case State::kExpectArrayEnd:
    case State::kExpectObjectEnd: {
      const TokenType result = state_ == State::kExpectArrayEnd
                                   ? TokenType::kEndArray
                                   : TokenType::kEndObject;
      // This is a transient state so we can send 2 tokens out in a row. We
      // discover the object or array end at the end of reading the value.
      object_type_.pop_back();
      if (object_type_.size() == 0) {
        // We unwound the outer object. We should send kEnd next.
        state_ = State::kExpectEnd;
      } else if (object_type_.back() == ObjectType::kObject) {
        // If we are going into an object, it should either have another field
        // or end.
        if (Consume(",")) {
          ConsumeWhitespace();
          state_ = State::kExpectField;
        } else if (Consume("}")) {
          ConsumeWhitespace();
          state_ = State::kExpectObjectEnd;
        } else {
          fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
          return TokenType::kError;
        }
      } else if (object_type_.back() == ObjectType::kArray) {
        // If we are going into an array, it should either have another value
        // or end.
        if (Consume(",")) {
          ConsumeWhitespace();
          state_ = State::kExpectValue;
        } else if (Consume("]")) {
          ConsumeWhitespace();
          state_ = State::kExpectArrayEnd;
        } else {
          fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
          return TokenType::kError;
        }
      }
      // And then send out the correct token.
      return result;
    }
    case State::kExpectEnd:
      // If we are supposed to be done, confirm nothing is after the end.
      if (AtEnd()) {
        return TokenType::kEnd;
      } else {
        fprintf(stderr, "Data past end at line %d\n", linenumber_);
        return TokenType::kError;
      }
  }
  return TokenType::kError;
}

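// Parses the most recently returned field_value() as a base-10 long long,
// e.g. a kNumberValue of "-42" yields *value == -42. Returns false if the
// whole string does not parse or strtoll reports a range error via errno.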
bool Tokenizer::FieldAsInt(long long *value) {
  const char *pos = field_value().c_str();
  errno = 0;
  *value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
    return false;
  }
  return true;
}

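// Parses the most recently returned field_value() as a double, mapping "nan"
// and "-nan" to quiet NaNs. Returns false if the whole string does not parse
// or strtod reports a range error via errno.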
bool Tokenizer::FieldAsDouble(double *value) {
  const char *pos = field_value().c_str();
  errno = 0;
  if (field_value() == "nan") {
    *value = std::numeric_limits<double>::quiet_NaN();
    return true;
  } else if (field_value() == "-nan") {
    *value = -std::numeric_limits<double>::quiet_NaN();
    return true;
  }

  *value = strtod(field_value().c_str(), const_cast<char **>(&pos));

  if (pos != field_value().c_str() + field_value().size() || errno != 0) {
    return false;
  }
  return true;
}
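
// A minimal usage sketch (assuming the Tokenizer constructor declared in
// json_tokenizer.h takes the JSON text as a string_view, and that a
// field_name() accessor exists to go with the field_value() used above):
//
//   aos::Tokenizer tokenizer(R"({"x": 1.5})");
//   while (true) {
//     const aos::Tokenizer::TokenType t = tokenizer.Next();
//     if (t == aos::Tokenizer::TokenType::kEnd ||
//         t == aos::Tokenizer::TokenType::kError) {
//       break;
//     }
//     // Inspect t, tokenizer.field_name(), and tokenizer.field_value() here.
//   }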

}  // namespace aos