#include "aos/json_tokenizer.h"
#include <cerrno>
namespace aos {
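// Consumes whitespace and /* */ comments until the next significant character
// (or the end of the data), keeping linenumber_ up to date as newlines go by.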
void Tokenizer::ConsumeWhitespace() {
while (true) {
if (AtEnd()) {
return;
}
// Skip any whitespace.
if (Char() == ' ' || Char() == '\r' || Char() == '\t') {
ConsumeChar();
} else if (Char() == '\n') {
ConsumeChar();
++linenumber_;
    } else if (Consume("/*")) {
      // Consume a /* */ comment, counting newlines as we go.
      while (!Consume("*/")) {
        // Bail out on an unterminated comment rather than running off the
        // end of the data.
        if (AtEnd()) {
          return;
        }
        if (Char() == '\n') {
          ++linenumber_;
        }
        ConsumeChar();
      }
} else {
      // Consuming whitespace can't fail.  Once we are out of whitespace
      // (including none at all), declare success.
return;
}
}
}
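// Matches the literal |token| against the front of data_.  On a mismatch (or
// if the data runs out first), data_ is rewound to where it started and false
// is returned.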
bool Tokenizer::Consume(const char *token) {
const std::string_view original = data_;
while (true) {
// Finishing the token is success.
if (*token == '\0') {
return true;
}
// But finishing the data first is failure.
if (AtEnd()) {
data_ = original;
return false;
}
    // A mismatch is failure.
if (*token != Char()) {
data_ = original;
return false;
}
ConsumeChar();
++token;
}
}
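// Parses a double-quoted JSON string into *s, translating the standard escape
// sequences (everything except \u).  On any failure data_ is rewound and
// false is returned.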
bool Tokenizer::ConsumeString(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // string.  Any AtEnd checks should confirm that.
const std::string_view original = data_;
if (AtEnd()) {
return false;
}
// Expect the leading "
if (Char() != '"') {
return false;
}
ConsumeChar();
std::string_view last_parsed_data = data_;
*s = ::std::string();
while (true) {
if (AtEnd()) {
data_ = original;
return false;
}
// If we get an end or an escape, do something special.
if (Char() == '"' || Char() == '\\') {
// Save what we found up until now, not including this character.
*s += ::std::string(
last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
// Update the pointer.
last_parsed_data = data_;
// " is the end, declare victory.
if (Char() == '"') {
ConsumeChar();
return true;
} else {
ConsumeChar();
// Now consume valid escape characters and add their representation onto
// the output string.
if (AtEnd()) {
data_ = original;
return false;
} else if (Char() == '"') {
*s += "\"";
} else if (Char() == '\\') {
*s += "\\";
} else if (Char() == '/') {
*s += "/";
} else if (Char() == 'b') {
*s += "\b";
} else if (Char() == 'f') {
*s += "\f";
} else if (Char() == 'n') {
*s += "\n";
} else if (Char() == 'r') {
*s += "\r";
} else if (Char() == 't') {
*s += "\t";
} else if (Char() == 'u') {
          // TODO(austin): Unicode escapes should be valid, but I really don't
          // care to do this now...
          fprintf(stderr, "Unsupported unicode escape on line %d\n",
                  linenumber_);
data_ = original;
return false;
}
}
      // Skip the escape code character so it isn't also copied literally;
      // the ConsumeChar() below advances past it.
      last_parsed_data = data_.substr(1);
}
ConsumeChar();
}
}
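// Parses a JSON number into *s as the raw text that was consumed: an optional
// leading '-', "nan" as a flatbuffers extension, an integer part, an optional
// fraction, and an optional exponent.  On failure data_ is rewound and false
// is returned.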
bool Tokenizer::ConsumeNumber(::std::string *s) {
  // Under no conditions is it acceptable to run out of data while parsing a
  // number.  Any AtEnd() checks should confirm that.
*s = ::std::string();
const std::string_view original = data_;
// Consume the leading - unconditionally.
Consume("-");
  // See if we find nan.  This isn't standards-compliant, but it is what
  // flatbuffers prints out, so we need to parse it.
if (Consume("nan")) {
*s = ::std::string(original.substr(0, original.size() - data_.size()));
return true;
}
  // Then we expect either a lone 0 or a nonzero digit.  Only a nonzero
  // leading digit may be followed by more digits.
  if (!Consume("0")) {
    if (AtEnd()) {
      data_ = original;
      return false;
    } else if (Char() >= '1' && Char() <= '9') {
      // This wasn't a zero, but was a valid digit.  Consume it.
      ConsumeChar();
    } else {
      data_ = original;
      return false;
    }
    // Now consume any number of additional digits.
while (true) {
if (AtEnd()) {
data_ = original;
return false;
}
if (Char() < '0' || Char() > '9') {
break;
}
ConsumeChar();
}
}
  // We could now have a decimal.  Bail out if we ran out of data instead of
  // reading past the end.
  if (AtEnd()) {
    data_ = original;
    return false;
  }
  if (Char() == '.') {
ConsumeChar();
while (true) {
if (AtEnd()) {
data_ = original;
return false;
}
// And any number of digits.
if (Char() < '0' || Char() > '9') {
break;
}
ConsumeChar();
}
}
// And now an exponent.
if (Char() == 'e' || Char() == 'E') {
ConsumeChar();
if (AtEnd()) {
data_ = original;
return false;
}
    // The exponent may have a leading + or -.
if (Char() == '+' || Char() == '-') {
ConsumeChar();
}
int count = 0;
while (true) {
if (AtEnd()) {
data_ = original;
return false;
}
// And digits.
if (Char() < '0' || Char() > '9') {
break;
}
ConsumeChar();
++count;
}
    // But it is an error to have an exponent marker with no digits after it.
if (count == 0) {
data_ = original;
return false;
}
}
*s = ::std::string(original.substr(0, original.size() - data_.size()));
return true;
}
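// Walks the state machine forward by one token and returns it.  Callers are
// expected to call this repeatedly until kEnd or kError comes back.  A rough
// usage sketch (assuming the constructor takes the JSON text and that
// field_name()/field_value() accessors are declared in json_tokenizer.h):
//
//   Tokenizer tokenizer(R"({"foo": 971})");
//   while (true) {
//     const Tokenizer::TokenType token = tokenizer.Next();
//     if (token == Tokenizer::TokenType::kEnd ||
//         token == Tokenizer::TokenType::kError) {
//       break;
//     }
//     // Inspect token and, for field and value tokens, the current
//     // field_name()/field_value().
//   }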
Tokenizer::TokenType Tokenizer::Next() {
switch (state_) {
case State::kExpectObjectStart:
// We should always start out with a {
if (!Consume("{")) return TokenType::kError;
// Document that we just started an object.
object_type_.push_back(ObjectType::kObject);
ConsumeWhitespace();
if (Consume("}")) {
ConsumeWhitespace();
state_ = State::kExpectObjectEnd;
} else {
state_ = State::kExpectField;
}
return TokenType::kStartObject;
case State::kExpectField: {
      // A field is a string, optional whitespace, and then a ':' (followed by
      // more whitespace...)
::std::string s;
if (!ConsumeString(&s)) {
fprintf(stderr, "Error on line %d, expected string for field name.\n",
linenumber_);
if (Consume("}")) {
fprintf(stderr,
"Got '}' instead. Did you add an extra trailing ','?\n");
}
return TokenType::kError;
}
field_name_ = ::std::move(s);
ConsumeWhitespace();
if (!Consume(":")) {
fprintf(stderr, "Error on line %d\n", linenumber_);
return TokenType::kError;
}
ConsumeWhitespace();
state_ = State::kExpectValue;
return TokenType::kField;
} break;
case State::kExpectValue: {
TokenType result = TokenType::kError;
::std::string s;
if (Consume("{")) {
        // A '{' starts a nested object.  Record it and recurse.
object_type_.push_back(ObjectType::kObject);
ConsumeWhitespace();
// And then if we encounter the end again, go to the end state.
if (Consume("}")) {
ConsumeWhitespace();
state_ = State::kExpectObjectEnd;
} else {
state_ = State::kExpectField;
}
return TokenType::kStartObject;
} else if (Consume("[")) {
        // A '[' starts a nested array.  Record it and recurse.
object_type_.push_back(ObjectType::kArray);
ConsumeWhitespace();
state_ = State::kExpectValue;
return TokenType::kStartArray;
} else if (ConsumeString(&s)) {
// Parsed as a string, grab it.
field_value_ = ::std::move(s);
result = TokenType::kStringValue;
} else if (ConsumeNumber(&s)) {
// Parsed as a number, grab it.
field_value_ = ::std::move(s);
result = TokenType::kNumberValue;
} else if (Consume("true")) {
        // Parsed as true, grab it.
field_value_ = "true";
result = TokenType::kTrueValue;
} else if (Consume("false")) {
        // Parsed as false, grab it.
field_value_ = "false";
result = TokenType::kFalseValue;
} else {
switch (object_type_.back()) {
case ObjectType::kObject:
if (Consume("}")) {
ConsumeWhitespace();
state_ = State::kExpectObjectEnd;
return Next();
}
break;
case ObjectType::kArray:
if (Consume("]")) {
ConsumeWhitespace();
state_ = State::kExpectArrayEnd;
return Next();
}
break;
}
// Couldn't parse, so we have a syntax error.
fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
}
ConsumeWhitespace();
      // After a value, we either have a ',' and another field (or another
      // value if we are in an array), or we should be closing out the object
      // (or array).
if (Consume(",")) {
ConsumeWhitespace();
switch (object_type_.back()) {
case ObjectType::kObject:
state_ = State::kExpectField;
break;
case ObjectType::kArray:
state_ = State::kExpectValue;
break;
}
} else {
// Sanity check that the stack is deep enough.
if (object_type_.size() == 0) {
fprintf(stderr, "Error on line %d\n", linenumber_);
return TokenType::kError;
}
        // And then require closing out the object or array.
switch (object_type_.back()) {
case ObjectType::kObject:
if (Consume("}")) {
ConsumeWhitespace();
state_ = State::kExpectObjectEnd;
} else {
fprintf(stderr, "Error on line %d, expected } or ,\n", linenumber_);
return TokenType::kError;
}
break;
case ObjectType::kArray:
if (Consume("]")) {
ConsumeWhitespace();
state_ = State::kExpectArrayEnd;
} else {
fprintf(stderr, "Error on line %d, expected ] or ,\n", linenumber_);
return TokenType::kError;
}
break;
}
}
return result;
} break;
case State::kExpectArrayEnd:
case State::kExpectObjectEnd: {
const TokenType result = state_ == State::kExpectArrayEnd
? TokenType::kEndArray
: TokenType::kEndObject;
      // This is a transient state so we can send two tokens out in a row.  We
      // discover the object or array end at the end of reading the value.
object_type_.pop_back();
if (object_type_.size() == 0) {
// We unwound the outer object. We should send kEnd next.
state_ = State::kExpectEnd;
} else if (object_type_.back() == ObjectType::kObject) {
        // If we are popping back into an object, it should either have
        // another field or end.
if (Consume(",")) {
ConsumeWhitespace();
state_ = State::kExpectField;
} else if (Consume("}")) {
ConsumeWhitespace();
state_ = State::kExpectObjectEnd;
} else {
fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
return TokenType::kError;
}
} else if (object_type_.back() == ObjectType::kArray) {
        // If we are popping back into an array, it should either have another
        // value or end.
if (Consume(",")) {
ConsumeWhitespace();
state_ = State::kExpectValue;
} else if (Consume("]")) {
ConsumeWhitespace();
state_ = State::kExpectArrayEnd;
} else {
fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
return TokenType::kError;
}
}
// And then send out the correct token.
return result;
}
case State::kExpectEnd:
// If we are supposed to be done, confirm nothing is after the end.
if (AtEnd()) {
return TokenType::kEnd;
} else {
fprintf(stderr, "Data past end at line %d\n", linenumber_);
return TokenType::kError;
}
}
return TokenType::kError;
}
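// Parses the most recently read field value as a base-10 long long.  Returns
// false if the value is out of range or has any characters left over after
// the number.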
bool Tokenizer::FieldAsInt(long long *value) {
const char *pos = field_value().c_str();
errno = 0;
*value = strtoll(field_value().c_str(), const_cast<char **>(&pos), 10);
if (pos != field_value().c_str() + field_value().size() || errno != 0) {
return false;
}
return true;
}
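// Parses the most recently read field value as a double, special-casing the
// "nan"/"-nan" spellings that flatbuffers emits.  Returns false on range
// errors or trailing characters.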
bool Tokenizer::FieldAsDouble(double *value) {
const char *pos = field_value().c_str();
errno = 0;
if (field_value() == "nan") {
*value = std::numeric_limits<double>::quiet_NaN();
return true;
} else if (field_value() == "-nan") {
*value = -std::numeric_limits<double>::quiet_NaN();
return true;
}
*value = strtod(field_value().c_str(), const_cast<char **>(&pos));
if (pos != field_value().c_str() + field_value().size() || errno != 0) {
return false;
}
return true;
}
} // namespace aos