// blob: 09b1e4567d24869a9e7035e1a6da6bb4de616c15 (source-viewer header residue; not part of the program)
#include "jsont.hh"
namespace jsont {
// Lookup table for hexadecimal digits. It covers the ASCII range '0'..'f' and
// is indexed by `byte - '0'`. Each entry is the digit's value (0-15), or -1
// when the byte in that position is not a hexadecimal digit.
static const int8_t kHexValueTable[55] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-9
  -1, -1, -1, -1, -1, -1, -1, // ':' through '@'
  10, 11, 12, 13, 14, 15, // A-F
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'G' through 'S'
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'T' through '`'
  10, 11, 12, 13, 14, 15 // a-f
};

// Parses `len` bytes at `bytes` as an unsigned hexadecimal number (upper- or
// lower-case digits, no prefix).
//
// Returns the parsed value, or UINT64_MAX if any byte is not a hexadecimal
// digit or the value does not fit in 64 bits. Note that UINT64_MAX is therefore
// ambiguous for an input of sixteen 'F' digits. An empty input yields 0.
static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
  uint64_t value = 0;
  // Largest value that can still be multiplied by 16 without overflowing, and
  // the largest digit that may then be added to it.
  const uint64_t cutoff = UINT64_MAX / 16;
  const int cutoff_digit = (int)(UINT64_MAX - cutoff * 16);
  for (size_t i = 0; i != len; ++i) {
    const uint8_t b = bytes[i];
    const int digit = (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;
    // The validity test must look at the decoded digit, not at the raw byte:
    // an unsigned byte can never compare equal to -1.
    if (digit == -1 ||                                       // bad digit
        value > cutoff ||                                    // overflow
        (value == cutoff && digit > cutoff_digit)) {
      return UINT64_MAX;
    }
    value = (value * 16) + (uint64_t)digit;
  }
  return value;
}
// Quiet-NaN constant returned when a numeric value cannot be interpreted.
// Uses the NAN macro when the math header provides it. The fallback calls
// nan(""), because nan() takes a C string tag and passing the integer 0 would
// hand it a null pointer.
#ifdef NAN
#define _JSONT_NAN NAN
#else
#define _JSONT_NAN nan("")
#endif
// Returns a static, human-readable name for `tok`, or "?" when the token is
// not one of the named public token kinds.
const char* token_name(jsont::Token tok) {
  const char* name = "?";
  switch (tok) {
    case End:         name = "End";         break;
    case ObjectStart: name = "ObjectStart"; break;
    case ObjectEnd:   name = "ObjectEnd";   break;
    case ArrayStart:  name = "ArrayStart";  break;
    case ArrayEnd:    name = "ArrayEnd";    break;
    case True:        name = "True";        break;
    case False:       name = "False";       break;
    case Null:        name = "Null";        break;
    case Integer:     name = "Integer";     break;
    case Float:       name = "Float";       break;
    case String:      name = "String";      break;
    case FieldName:   name = "FieldName";   break;
    default:          break;
  }
  return name;
}
// Helper with access to Tokenizer internals, used by Tokenizer::next().
class TokenizerInternal {
public:
  // Pointer to the next unread input byte of `self`.
  inline static const uint8_t* currentInput(const Tokenizer& self) {
    const uint8_t* const base = self._input.bytes;
    return base + self._input.offset;
  }

  // Matches the `len` bytes of `str` against the unread input. On a match the
  // bytes are consumed and `token` becomes the current token. Otherwise an
  // error is set: PrematureEndOfInput when fewer than `len` bytes remain,
  // InvalidByte when the bytes differ.
  inline static const Token& readAtom(Tokenizer& self, const char* str,
                                      size_t len, const Token& token) {
    if (self.availableInput() < len) {
      return self.setError(Tokenizer::PrematureEndOfInput);
    }
    const bool matches = (memcmp(currentInput(self), str, len) == 0);
    if (!matches) {
      return self.setError(Tokenizer::InvalidByte);
    }
    self._input.offset += len;
    return self.setToken(token);
  }
};
// The destructor body is empty; no explicit cleanup is performed here.
Tokenizer::~Tokenizer() {}
// Points the tokenizer at a new input buffer of `length` bytes, clears the
// error code and read offset, and immediately advances to the first token.
// Only UTF-8 input is supported (checked with an assertion).
void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) {
  assert(encoding == UTF8TextEncoding); // only supported encoding

  _error.code   = UnspecifiedError;
  _input.offset = 0;
  _input.length = length;
  _input.bytes  = reinterpret_cast<const uint8_t*>(bytes);

  // Advance to first token
  next();
}
// Returns a static, human-readable description of the current error code.
// Codes without a dedicated message map to "Unspecified error".
const char* Tokenizer::errorMessage() const {
  const char* message = "Unspecified error";
  switch (_error.code) {
    case UnexpectedComma:
      message = "Unexpected comma";
      break;
    case UnexpectedTrailingComma:
      message = "Unexpected trailing comma";
      break;
    case InvalidByte:
      message = "Invalid input byte";
      break;
    case PrematureEndOfInput:
      message = "Premature end of input";
      break;
    case MalformedUnicodeEscapeSequence:
      message = "Malformed Unicode escape sequence";
      break;
    case MalformedNumberLiteral:
      message = "Malformed number literal";
      break;
    case UnterminatedString:
      message = "Unterminated string";
      break;
    case SyntaxError:
      message = "Illegal JSON (syntax error)";
      break;
    default:
      break;
  }
  return message;
}
// Exposes the raw bytes of the current value.
//
// If the current token carries a value, stores a pointer to its bytes in
// `*bytes` and returns the number of bytes. The pointer refers either to the
// tokenizer's internal buffer (values that contained escape sequences) or
// directly into the input buffer, and is not NUL-terminated in the latter
// case. Returns 0 and leaves `*bytes` untouched when there is no value.
//
// The parameter type is spelled `const char**`: the previous
// `const char const**` repeated the same qualifier, which denotes the same
// type but is rejected as a duplicate specifier by some compilers.
size_t Tokenizer::dataValue(const char** bytes) const {
  if (!hasValue()) { return 0; }
  if (_value.buffered) {
    *bytes = (const char*)_value.buffer.data();
    return _value.buffer.size();
  }
  *bytes = (const char*)(_input.bytes + _value.offset);
  return _value.length;
}
// Interprets the current value as a floating point number.
//
// Tokens without a value yield 1.0 for True and 0.0 for anything else.
// Otherwise the value bytes are parsed with strtod. Returns NaN when the value
// sits at the very end of the input and is too long (more than 127 bytes) to
// be copied into the local scratch buffer.
double Tokenizer::floatValue() const {
  if (!hasValue()) {
    return _token == jsont::True ? 1.0 : 0.0;
  }

  if (_value.buffered) {
    // edge-case since only happens with string values using escape sequences
    return strtod(_value.buffer.c_str(), (char**)0);
  }

  const char* bytes = (const char*)_input.bytes + _value.offset;
  if (availableInput() != 0) {
    // At least one more input byte follows, so strtod is handed the input
    // buffer directly.
    return strtod(bytes, (char**)0);
  }

  // The data lies at the edge of the input buffer, so there is no sentinel
  // byte after it and it cannot be passed to strtod directly. Copy it into a
  // NUL-terminated scratch buffer instead. This must be an array of char (not
  // of char*), otherwise the terminator is not written right after the data.
  char buf[128];
  if (_value.length > sizeof(buf) - 1) {
    // We are unable to interpret such a large literal in this edge-case
    return _JSONT_NAN;
  }
  memcpy(buf, bytes, _value.length);
  buf[_value.length] = '\0';
  return strtod(buf, (char**)0);
}
// Interprets the current value as a base-10 signed integer.
//
// Tokens without a value yield 1 for True and 0 for anything else. Otherwise
// the value bytes are parsed with strtoll. Returns 0 when the value sits at
// the very end of the input and is too long (more than 20 bytes) to be copied
// into the local scratch buffer.
int64_t Tokenizer::intValue() const {
  if (!hasValue()) {
    return _token == jsont::True ? 1LL : 0LL;
  }

  if (_value.buffered) {
    // edge-case since only happens with string values using escape sequences
    return strtoll(_value.buffer.c_str(), (char**)0, 10);
  }

  const char* bytes = (const char*)_input.bytes + _value.offset;
  if (availableInput() != 0) {
    // At least one more input byte follows, so strtoll is handed the input
    // buffer directly.
    return strtoll(bytes, (char**)0, 10);
  }

  // The data lies at the edge of the input buffer, so there is no sentinel
  // byte after it and it cannot be passed to strtoll directly. Copy it into a
  // NUL-terminated scratch buffer instead. This must be an array of char (not
  // of char*), otherwise the terminator is not written right after the data.
  // 20 characters are enough for any 64-bit decimal literal including a sign.
  char buf[21];
  if (_value.length > sizeof(buf) - 1) {
    // We are unable to interpret such a large literal in this edge-case
    return 0;
  }
  memcpy(buf, bytes, _value.length);
  buf[_value.length] = '\0';
  return strtoll(buf, (char**)0, 10);
}
// Advances to the next token in the input and returns it.
//
// Whitespace and commas between values are skipped (commas are remembered so
// that leading, repeated and trailing commas can be reported). Returns the
// End token once the input is exhausted, or the result of setError() when the
// input is malformed.
//
// Values (strings and numbers) are not copied: the tokenizer records where
// the value starts and how long it is. Strings containing escape sequences
// are the exception; they are decoded into the value buffer.
const Token& Tokenizer::next() {
  // Dispatch on the first significant byte of the next token:
  //
  //   { } [ ] n t f "
  //           | | | |
  //           | | | +- /[^"]*/ "
  //           | | +- a l s e
  //           | +- r u e
  //           +- u l l
  //
  while (!endOfInput()) {
    uint8_t b = _input.bytes[_input.offset++];
    switch (b) {
      case '{': return setToken(ObjectStart);
      case '}': {
        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
        return setToken(ObjectEnd);
      }
      case '[': return setToken(ArrayStart);
      case ']': {
        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
        return setToken(ArrayEnd);
      }
      case 'n':
        return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null);
      case 't':
        return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True);
      case 'f':
        return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False);
      case ' ': case '\t': case '\r': case '\n': // IETF RFC4627
        // ignore whitespace and let the outer "while" do its thing
        break;
      case 0:
        return setError(InvalidByte);

      // When we read a string we don't produce a token until we either reach
      // end of input, a colon (then the string is a field name), a comma, or
      // an array or object terminator.
      case '"': {
        _value.beginAtOffset(_input.offset);

        // Track termination explicitly. Inspecting the last byte read is not
        // reliable: it is still the opening quote when the input ends right
        // after it, and would be an escaped quote for input ending in \".
        bool terminated = false;
        while (!terminated && !endOfInput()) {
          b = _input.bytes[_input.offset++];
          // The offset may legitimately equal the length after consuming the
          // last input byte.
          assert(_input.offset <= _input.length);
          switch (b) {
            case '\\': {
              // We must go buffered since the input segment != value
              if (!_value.buffered) {
                _value.buffered = true;
                _value.buffer.assign(
                  (const char*)(_input.bytes+_value.offset),
                  _input.offset - _value.offset - 1
                );
              }
              if (endOfInput()) {
                return setError(PrematureEndOfInput);
              }
              const uint8_t escaped = _input.bytes[_input.offset++];
              switch (escaped) {
                case 'b': _value.buffer.append(1, '\x08'); break;
                case 'f': _value.buffer.append(1, '\x0C'); break;
                case 'n': _value.buffer.append(1, '\x0A'); break;
                case 'r': _value.buffer.append(1, '\x0D'); break;
                case 't': _value.buffer.append(1, '\x09'); break;
                case 'u': {
                  // \uxxxx
                  if (availableInput() < 4) {
                    return setError(PrematureEndOfInput);
                  }
                  uint64_t utf16cp =
                    _xtou64(TokenizerInternal::currentInput(*this), 4);
                  _input.offset += 4;
                  if (utf16cp > 0xffff) {
                    return setError(MalformedUnicodeEscapeSequence);
                  }
                  uint16_t cp = (uint16_t)(0xffff & utf16cp);
                  // Append UTF-8 byte(s) representing the Unicode codepoint cp
                  if (cp < 0x80) {
                    // U+0000 - U+007F
                    uint8_t cp8 = ((uint8_t)cp);
                    _value.buffer.append(1, (char)cp8);
                  } else if (cp < 0x800) {
                    // U+0080 - U+07FF
                    uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
                    _value.buffer.append(1, (char)cp8);
                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
                    _value.buffer.append(1, (char)cp8);
                  } else if (cp >= 0xD800u && cp <= 0xDFFFu) {
                    // UTF-16 Surrogate pairs -- according to the UTF-8
                    // definition (RFC 3629) the high and low surrogate halves
                    // used by UTF-16 (U+D800 through U+DFFF) are not legal
                    // Unicode values, and the UTF-8 encoding of them is an
                    // invalid byte sequence. Instead of throwing an error, we
                    // substitute this character with the replacement character
                    // U+FFFD (UTF-8: EF,BF,BD).
                    _value.buffer.append("\xEF\xBF\xBD");
                  } else {
                    // U+0800 - U+FFFF
                    uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
                    _value.buffer.append(1, (char)cp8);
                    cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
                    _value.buffer.append(1, (char)cp8);
                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
                    _value.buffer.append(1, (char)cp8);
                  }
                  break;
                }
                default:
                  // \" \\ \/ and any other escaped byte: keep the byte itself
                  _value.buffer.append(1, (char)escaped);
                  break;
              }
              break;
            }

            case '"':
              terminated = true;
              break;

            case 0:
              return setError(InvalidByte);

            default: {
              if (_value.buffered) {
                // TODO: Make this efficient by appending chunks between
                // boundaries instead of appending per-byte
                _value.buffer.append(1, (char)b);
              }
              break;
            }
          } // switch(b)
        } // while (!terminated && !endOfInput())

        if (!terminated) {
          return setError(UnterminatedString);
        }

        if (!_value.buffered) {
          // The offset is one past the closing quote
          _value.length = _input.offset - _value.offset - 1;
        }

        // is this a field name?
        while (!endOfInput()) {
          b = _input.bytes[_input.offset++];
          switch (b) {
            case ' ': case '\t': case '\r': case '\n': break;
            case ':': return setToken(FieldName);
            case ',': case ']': case '}': {
              // Rewind so the next call sees the separator/terminator. This
              // matters for commas too: the main loop records them, which is
              // what makes trailing and repeated commas detectable after a
              // string just like after a number.
              --_input.offset;
              return setToken(jsont::String);
            }
            case 0: return setError(InvalidByte);
            default: {
              // Expected a comma or a colon
              return setError(SyntaxError);
            }
          }
        }
        return setToken(jsont::String);
      }

      case ',': {
        if (_token == ObjectStart || _token == ArrayStart || _token == _Comma) {
          return setError(UnexpectedComma);
        }
        _token = _Comma;
        break;
      }

      default: {
        if (isdigit((int)b) || b == '+' || b == '-') {
          // We are reading a number
          _value.beginAtOffset(_input.offset-1);
          Token token = jsont::Integer;

          bool atTerminator = false;
          while (!atTerminator && !endOfInput()) {
            b = _input.bytes[_input.offset++];
            switch (b) {
              case '0': case '1': case '2': case '3': case '4':
              case '5': case '6': case '7': case '8': case '9':
                break;
              case '.': token = jsont::Float; break;
              case 'E': case 'e': case '-': case '+': {
                // NOTE(review): exponents and signs are only accepted once a
                // '.' has been seen, so a literal such as 1e5 is rejected.
                if (token != jsont::Float) {
                  return setError(MalformedNumberLiteral);
                }
                break;
              }
              default: {
                // rewind the byte that terminated this number literal
                --_input.offset;
                atTerminator = true;
                break;
              }
            }
          }

          // The offset now addresses the terminating byte (or the end of the
          // input), so the literal spans exactly [_value.offset, offset).
          _value.length = _input.offset - _value.offset;

          // A sign on its own is not a number
          if ( (_value.length == 1) &&
               (_input.bytes[_value.offset] == '-' ||
                _input.bytes[_value.offset] == '+') ) {
            return setError(MalformedNumberLiteral);
          }

          // A number that runs up to the end of the input is still a token;
          // the following call reports End.
          return setToken(token);
        } else {
          return setError(InvalidByte);
        }
      }
    }
  }

  return setToken(End);
}
// Escaping classes stored in kUTF8ByteTable. Any other (non-zero, larger)
// table entry is the character to emit after a reverse solidus, e.g. 'n' for
// a line feed.
enum {
kUTF8ByteVerbatim = 0,
kUTF8ByteEncode1, // "\u000x"
kUTF8ByteEncode2, // "\u00xx"
};
// Short aliases that keep the table below compact; undefined again right
// after it.
#define V kUTF8ByteVerbatim
#define E1 kUTF8ByteEncode1
#define E2 kUTF8ByteEncode2
// Per-byte escaping decision used by Builder::appendString, indexed by the
// byte value: V copies the byte as-is, E1/E2 emit a \u00XX escape with one or
// two significant hex digits, and a character literal emits a two-byte
// backslash escape ending in that character.
static const uint8_t kUTF8ByteTable[256] = {
E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1, E2, E2,
E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, V, V, '"', V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, '\\', V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, E2, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
V, V, V, V, V, V, V, V, V, V
};
#undef V
#undef E1
#undef E2
// #ifndef __has_feature
// #define __has_feature(x) 0
// #endif
// #if defined(__cplusplus) && __has_feature(cxx_static_assert)
// #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg))
// #elif __has_feature(c_static_assert)
// #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg))
// #else
// #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0)
// #endif
// Appends the `length` bytes at `v` to the output as a double-quoted JSON
// string and returns *this.
//
// Each byte is classified through kUTF8ByteTable: most bytes are copied
// verbatim, control bytes and DEL become \u00XX escapes, and the remaining
// special bytes become two-byte backslash escapes (for example \n or \").
// Only UTF-8 input is supported (checked with an assertion).
//
// NOTE(review): the reserve() calls assume reserve(n) guarantees room for n
// more bytes beyond the current size -- confirm against its definition.
Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) {
  // Upper-case hexadecimal digits, indexed by nibble value (0-15)
  static const char kHexDigits[] = "0123456789ABCDEF";

  reserve(length + 2); // the payload plus the two enclosing quotes
  _buf[_size++] = '"';

  assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */);

  const uint8_t* end = v+length;
  while (v != end) {
    uint8_t s = kUTF8ByteTable[*v];
    switch (s) {
      case kUTF8ByteVerbatim:
        _buf[_size++] = *v;
        break;

      case kUTF8ByteEncode1: {
        // "\u000X": a single significant hex digit
        assert(*v < 16);
        size_t remainingSize = end-v+1+5; // five additional bytes needed
        reserve(remainingSize);
        _buf[_size++] = '\\';
        _buf[_size++] = 'u';
        _buf[_size++] = '0';
        _buf[_size++] = '0';
        _buf[_size++] = '0';
        _buf[_size++] = kHexDigits[*v & 0x0f];
        assert(_size <= _capacity);
        break;
      }

      case kUTF8ByteEncode2: {
        // "\u00XX": two significant hex digits.
        // Note: *v is guaranteed to be within the set [16,32),127. This is
        // an effect of the kUTF8ByteTable lookup table and this code needs to
        // be revised if the lookup table adds or removes any kUTF8ByteEncode.
        assert((*v > 15 && *v < 32) || *v == 127);
        size_t remainingSize = end-v+1+5; // five additional bytes needed
        reserve(remainingSize);
        _buf[_size++] = '\\';
        _buf[_size++] = 'u';
        _buf[_size++] = '0';
        _buf[_size++] = '0';
        // Indexing the digit table maps every nibble correctly, including the
        // value 10 ('A'), which a "greater than 10" test would turn into ':'.
        _buf[_size++] = kHexDigits[(*v >> 4) & 0x0f];
        _buf[_size++] = kHexDigits[*v & 0x0f];
        assert(_size <= _capacity);
        break;
      }

      default: {
        // reverse solidus escape; `s` is the character following the backslash
        size_t remainingSize = end-v+1+1; // one additional byte needed
        reserve(remainingSize);
        _buf[_size++] = '\\';
        _buf[_size++] = s;
        assert(_size <= _capacity);
        break;
      }
    }
    ++v;
  }

  _buf[_size++] = '"';
  assert(_size <= _capacity);
  return *this;
}
#if JSONT_CXX_RVALUE_REFS
// Move constructor: takes over the buffer of `other` and leaves `other` empty
// (no buffer, zero size and capacity) so that it never describes storage it
// no longer owns.
Builder::Builder(Builder&& other)
    : _buf(other._buf)
    , _capacity(other._capacity)
    , _size(other._size)
    , _state(other._state) {
  other._buf = 0;
  other._capacity = 0;
  other._size = 0;
}

// Move assignment: releases the buffer currently held, then takes over the
// buffer of `other` and leaves `other` empty. Assigning an object to itself
// is a no-op.
//
// The old buffer is released with free(), matching the malloc() used by the
// copy operations in this file.
Builder& Builder::operator=(Builder&& other) {
  if (this != &other) {
    free(_buf);
    _buf = other._buf;
    _capacity = other._capacity;
    _size = other._size;
    _state = other._state;
    other._buf = 0;
    other._capacity = 0;
    other._size = 0;
  }
  return *this;
}
#endif
// Copy constructor: allocates a buffer of the same capacity as `other` and
// copies the bytes written so far.
//
// When `other` holds no buffer (for example after being moved from), or the
// allocation fails, the new object is left in a consistent empty state (no
// buffer, zero size and capacity) instead of copying through a null pointer.
Builder::Builder(const Builder& other)
    : _buf(0)
    , _capacity(other._capacity)
    , _size(other._size)
    , _state(other._state) {
  if (other._buf != 0 && _capacity != 0) {
    _buf = (char*)malloc(_capacity);
  }
  if (_buf == 0) {
    _capacity = 0;
    _size = 0;
    return;
  }
  memcpy((void*)_buf, (const void*)other._buf, _size);
}
// Copy assignment: replaces the contents of this builder with a copy of
// `other` and returns *this.
//
// The copy is made before the current buffer is released, and assigning an
// object to itself is a no-op, so the source is never read after it has been
// replaced. The old buffer is released with free(), matching the malloc() used
// for allocation here. When `other` holds no buffer, or the allocation fails,
// this builder ends up empty (no buffer, zero size and capacity).
Builder& Builder::operator=(const Builder& other) {
  if (this == &other) {
    return *this;
  }

  char* copy = 0;
  if (other._buf != 0 && other._capacity != 0) {
    copy = (char*)malloc(other._capacity);
    if (copy != 0) {
      memcpy((void*)copy, (const void*)other._buf, other._size);
    }
  }

  free(_buf);
  _buf = copy;
  if (copy != 0) {
    _capacity = other._capacity;
    _size = other._size;
  } else {
    _capacity = 0;
    _size = 0;
  }
  _state = other._state;
  return *this;
}
} // namespace jsont