| #include "jsont.hh" |
| |
| namespace jsont { |
| |
// Hex digit values for the byte range '0' (0x30) through 'f' (0x66), indexed
// by (byte - '0'). Bytes in that range that are not hex digits map to -1.
static const int8_t kHexValueTable[55] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,                          // '0'-'9'
  -1, -1, -1, -1, -1, -1, -1,                            // ':'-'@'
  10, 11, 12, 13, 14, 15,                                // 'A'-'F'
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,    // 'G'-'S'
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,    // 'T'-'`'
  10, 11, 12, 13, 14, 15                                 // 'a'-'f'
};

// Parses `len` bytes starting at `bytes` as an unsigned hexadecimal number
// (upper- or lowercase digits, no prefix).
//
// Returns the parsed value, or UINT64_MAX if any byte is not a hex digit or
// if the value does not fit in 64 bits. Note that UINT64_MAX is therefore also
// what a valid all-'F' 16-digit input yields. An empty input yields 0.
static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
  // Largest value that can still be multiplied by 16 without overflowing.
  // When value == cutoff, value*16 ends in a zero nibble, so adding any digit
  // (0..15) cannot overflow either.
  const uint64_t cutoff = UINT64_MAX / 16;
  uint64_t value = 0;

  for (size_t i = 0; i != len; ++i) {
    const uint8_t b = bytes[i];
    const int8_t digit = (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;
    if (digit < 0 ||        // bad digit
        value > cutoff) {   // overflow
      return UINT64_MAX;
    }
    value = (value * 16) + (uint64_t)digit;
  }

  return value;
}
| |
| |
// Quiet NaN constant returned when a literal cannot be interpreted as a number.
// Prefer the NAN macro when the math header provides it; otherwise ask nan()
// for one. nan() takes a C string tag, so pass an empty string rather than a
// null pointer.
#if defined(NAN)
#define _JSONT_NAN NAN
#else
#define _JSONT_NAN nan("")
#endif
| |
| |
| const char* token_name(jsont::Token tok) { |
| switch (tok) { |
| case End: return "End"; |
| case ObjectStart: return "ObjectStart"; |
| case ObjectEnd: return "ObjectEnd"; |
| case ArrayStart: return "ArrayStart"; |
| case ArrayEnd: return "ArrayEnd"; |
| case True: return "True"; |
| case False: return "False"; |
| case Null: return "Null"; |
| case Integer: return "Integer"; |
| case Float: return "Float"; |
| case String: return "String"; |
| case FieldName: return "FieldName"; |
| default: return "?"; |
| } |
| } |
| |
| |
| class TokenizerInternal { |
| public: |
| inline static const uint8_t* currentInput(const Tokenizer& self) { |
| return self._input.bytes + self._input.offset; |
| } |
| |
| inline static const Token& readAtom(Tokenizer& self, const char* str, |
| size_t len, const Token& token) { |
| if (self.availableInput() < len) { |
| return self.setError(Tokenizer::PrematureEndOfInput); |
| } else if (memcmp(currentInput(self), str, len) != 0) { |
| return self.setError(Tokenizer::InvalidByte); |
| } else { |
| self._input.offset += len; |
| return self.setToken(token); |
| } |
| } |
| }; |
| |
| |
| Tokenizer::~Tokenizer() {} |
| |
| |
| void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) { |
| assert(encoding == UTF8TextEncoding); // only supported encoding |
| _input.bytes = (const uint8_t*)bytes; |
| _input.length = length; |
| _input.offset = 0; |
| _error.code = UnspecifiedError; |
| // Advance to first token |
| next(); |
| } |
| |
| |
| const char* Tokenizer::errorMessage() const { |
| switch (_error.code) { |
| case UnexpectedComma: |
| return "Unexpected comma"; |
| case UnexpectedTrailingComma: |
| return "Unexpected trailing comma"; |
| case InvalidByte: |
| return "Invalid input byte"; |
| case PrematureEndOfInput: |
| return "Premature end of input"; |
| case MalformedUnicodeEscapeSequence: |
| return "Malformed Unicode escape sequence"; |
| case MalformedNumberLiteral: |
| return "Malformed number literal"; |
| case UnterminatedString: |
| return "Unterminated string"; |
| case SyntaxError: |
| return "Illegal JSON (syntax error)"; |
| default: |
| return "Unspecified error"; |
| } |
| } |
| |
| |
| size_t Tokenizer::dataValue(const char const** bytes) const { |
| if (!hasValue()) { return 0; } |
| if (_value.buffered) { |
| *bytes = (const char const*)_value.buffer.data(); |
| return _value.buffer.size(); |
| } else { |
| *bytes = (const char const*)(_input.bytes + _value.offset); |
| return _value.length; |
| } |
| } |
| |
| |
| double Tokenizer::floatValue() const { |
| if (!hasValue()) { |
| return _token == jsont::True ? 1.0 : 0.0; |
| } |
| |
| const char* bytes; |
| |
| if (_value.buffered) { |
| // edge-case since only happens with string values using escape sequences |
| bytes = _value.buffer.c_str(); |
| } else { |
| bytes = (const char*)_input.bytes + _value.offset; |
| if (availableInput() == 0) { |
| // In this case where the data lies at the edge of the buffer, we can't pass |
| // it directly to atof, since there will be no sentinel byte. We are fine |
| // with a copy, since this is an edge case (only happens either for broken |
| // JSON or when the whole document is just a number). |
| char* buf[128]; |
| if (_value.length > 127) { |
| // We are unable to interpret such a large literal in this edge-case |
| return _JSONT_NAN; |
| } |
| memcpy((void*)buf, (const void*)bytes, _value.length); |
| buf[_value.length] = '\0'; |
| return strtod((const char*)buf, (char**)0); |
| } |
| } |
| |
| return strtod(bytes, (char**)0); |
| } |
| |
| |
| int64_t Tokenizer::intValue() const { |
| if (!hasValue()) { |
| return _token == jsont::True ? 1LL : 0LL; |
| } |
| |
| const char* bytes; |
| |
| if (_value.buffered) { |
| // edge-case since only happens with string values using escape sequences |
| bytes = _value.buffer.c_str(); |
| } else { |
| bytes = (const char*)_input.bytes + _value.offset; |
| if (availableInput() == 0) { |
| // In this case where the data lies at the edge of the buffer, we can't pass |
| // it directly to atof, since there will be no sentinel byte. We are fine |
| // with a copy, since this is an edge case (only happens either for broken |
| // JSON or when the whole document is just a number). |
| char* buf[21]; |
| if (_value.length > 20) { |
| // We are unable to interpret such a large literal in this edge-case |
| return 0; |
| } |
| memcpy((void*)buf, (const void*)bytes, _value.length); |
| buf[_value.length] = '\0'; |
| return strtoll((const char*)buf, (char**)0, 10); |
| } |
| } |
| |
| return strtoll(bytes, (char**)0, 10); |
| } |
| |
| |
| const Token& Tokenizer::next() { |
| // |
| // { } [ ] n t f " |
| // | | | | |
| // | | | +- /[^"]*/ " |
| // | | +- a l s e |
| // | +- r u e |
| // +- u l l |
| // |
| while (!endOfInput()) { |
| uint8_t b = _input.bytes[_input.offset++]; |
| switch (b) { |
| case '{': return setToken(ObjectStart); |
| case '}': { |
| if (_token == _Comma) { return setError(UnexpectedTrailingComma); } |
| return setToken(ObjectEnd); |
| } |
| |
| case '[': return setToken(ArrayStart); |
| case ']': { |
| if (_token == _Comma) { return setError(UnexpectedTrailingComma); } |
| return setToken(ArrayEnd); |
| } |
| |
| case 'n': |
| return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null); |
| case 't': |
| return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True); |
| case 'f': |
| return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False); |
| |
| case ' ': case '\t': case '\r': case '\n': // IETF RFC4627 |
| // ignore whitespace and let the outer "while" do its thing |
| break; |
| |
| case 0: |
| return setError(InvalidByte); |
| |
| // when we read a value, we don't produce a token until we either reach |
| // end of input, a colon (then the value is a field name), a comma, or an |
| // array or object terminator. |
| |
| case '"': { |
| _value.beginAtOffset(_input.offset); |
| |
| while (!endOfInput()) { |
| b = _input.bytes[_input.offset++]; |
| assert(_input.offset < _input.length); |
| |
| switch (b) { |
| |
| case '\\': { |
| // We must go buffered since the input segment != value |
| if (!_value.buffered) { |
| _value.buffered = true; |
| _value.buffer.assign( |
| (const char*)(_input.bytes+_value.offset), |
| _input.offset - _value.offset - 1 |
| ); |
| } |
| |
| if (endOfInput()) { |
| return setError(PrematureEndOfInput); |
| } |
| |
| b = _input.bytes[_input.offset++]; |
| switch (b) { |
| case 'b': _value.buffer.append(1, '\x08'); break; |
| case 'f': _value.buffer.append(1, '\x0C'); break; |
| case 'n': _value.buffer.append(1, '\x0A'); break; |
| case 'r': _value.buffer.append(1, '\x0D'); break; |
| case 't': _value.buffer.append(1, '\x09'); break; |
| case 'u': { |
| // \uxxxx |
| if (availableInput() < 4) { |
| return setError(PrematureEndOfInput); |
| } |
| |
| uint64_t utf16cp = |
| _xtou64(TokenizerInternal::currentInput(*this), 4); |
| _input.offset += 4; |
| |
| if (utf16cp > 0xffff) { |
| return setError(MalformedUnicodeEscapeSequence); |
| } |
| |
| uint16_t cp = (uint16_t)(0xffff & utf16cp); |
| |
| // Append UTF-8 byte(s) representing the Unicode codepoint cp |
| if (cp < 0x80) { |
| // U+0000 - U+007F |
| uint8_t cp8 = ((uint8_t)cp); |
| _value.buffer.append(1, (char)cp8); |
| } else if (cp < 0x800) { |
| // U+0080 - U+07FF |
| uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0); |
| _value.buffer.append(1, (char)cp8); |
| cp8 = (uint8_t)((cp & 0x3f) | 0x80); |
| _value.buffer.append(1, (char)cp8); |
| } else if (cp >= 0xD800u && cp <= 0xDFFFu) { |
| // UTF-16 Surrogate pairs -- according to the UTF-8 |
| // definition (RFC 3629) the high and low surrogate halves |
| // used by UTF-16 (U+D800 through U+DFFF) are not legal |
| // Unicode values, and the UTF-8 encoding of them is an |
| // invalid byte sequence. Instead of throwing an error, we |
| // substitute this character with the replacement character |
| // U+FFFD (UTF-8: EF,BF,BD). |
| _value.buffer.append("\xEF\xBF\xBD"); |
| // |
| } else { |
| // U+0800 - U+FFFF |
| uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0); |
| _value.buffer.append(1, (char)cp8); |
| cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80); |
| _value.buffer.append(1, (char)cp8); |
| cp8 = (uint8_t)((cp & 0x3f) | 0x80); |
| _value.buffer.append(1, (char)cp8); |
| } |
| |
| break; |
| } |
| default: |
| _value.buffer.append(1, (char)b); break; |
| } |
| break; |
| } |
| |
| case '"': |
| goto after_initial_read_b; |
| |
| case 0: |
| return setError(InvalidByte); |
| |
| default: { |
| if (_value.buffered) { |
| // TODO: Make this efficient by appending chunks between |
| // boundaries instead of appending per-byte |
| _value.buffer.append(1, (char)b); |
| } |
| break; |
| } |
| } // switch(b) |
| } // while (!endOfInput()) |
| |
| after_initial_read_b: |
| if (b != '"') { |
| return setError(UnterminatedString); |
| } |
| |
| if (!_value.buffered) { |
| _value.length = _input.offset - _value.offset - 1; |
| } |
| |
| // is this a field name? |
| while (!endOfInput()) { |
| b = _input.bytes[_input.offset++]; |
| switch (b) { |
| case ' ': case '\t': case '\r': case '\n': break; |
| case ':': return setToken(FieldName); |
| case ',': goto string_read_return_string; |
| case ']': case '}': { |
| --_input.offset; // rewind |
| goto string_read_return_string; |
| } |
| case 0: return setError(InvalidByte); |
| default: { |
| // Expected a comma or a colon |
| return setError(SyntaxError); |
| } |
| } |
| } |
| |
| string_read_return_string: |
| return setToken(jsont::String); |
| } |
| |
| case ',': { |
| if (_token == ObjectStart || _token == ArrayStart || _token == _Comma) { |
| return setError(UnexpectedComma); |
| } |
| _token = _Comma; |
| break; |
| } |
| |
| default: { |
| if (isdigit((int)b) || b == '+' || b == '-') { |
| // We are reading a number |
| _value.beginAtOffset(_input.offset-1); |
| Token token = jsont::Integer; |
| |
| while (!endOfInput()) { |
| b = _input.bytes[_input.offset++]; |
| switch (b) { |
| case '0'...'9': break; |
| case '.': token = jsont::Float; break; |
| case 'E': case 'e': case '-': case '+': { |
| if (token != jsont::Float) { |
| return setError(MalformedNumberLiteral); |
| } |
| break; |
| } |
| default: { |
| if ( (_input.offset - _value.offset == 1) && |
| (_input.bytes[_value.offset] == '-' || |
| _input.bytes[_value.offset] == '+') ) { |
| return setError(MalformedNumberLiteral); |
| } |
| |
| // rewind the byte that terminated this number literal |
| --_input.offset; |
| |
| _value.length = _input.offset - _value.offset - 1; |
| return setToken(token); |
| } |
| } |
| } |
| return setToken(End); |
| } else { |
| return setError(InvalidByte); |
| } |
| } |
| } |
| } |
| |
| return setToken(End); |
| } |
| |
| |
// Escaping classes for bytes written inside a JSON string literal.
enum {
  kUTF8ByteVerbatim = 0,
  kUTF8ByteEncode1, // "\u000x"
  kUTF8ByteEncode2  // "\u00xx"
};

// Short aliases, only defined while the table below is spelled out.
#define V  kUTF8ByteVerbatim
#define E1 kUTF8ByteEncode1
#define E2 kUTF8ByteEncode2

// Indexed by byte value, one row per 16 values. An entry is either one of the
// classes above or the character that follows a backslash in the short escape
// for that byte (e.g. 'n' for 0x0A).
static const uint8_t kUTF8ByteTable[256] = {
  /* 0x00 */ E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1,
  /* 0x10 */ E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2,
  /* 0x20 */ V, V, '"', V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x30 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x40 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x50 */ V, V, V, V, V, V, V, V, V, V, V, V, '\\', V, V, V,
  /* 0x60 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x70 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, E2,
  /* 0x80 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x90 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xA0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xB0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xC0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xD0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xE0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xF0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V
};

#undef V
#undef E1
#undef E2
| |
| // #ifndef __has_feature |
| // #define __has_feature(x) 0 |
| // #endif |
| // #if defined(__cplusplus) && __has_feature(cxx_static_assert) |
| // #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg)) |
| // #elif __has_feature(c_static_assert) |
| // #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg)) |
| // #else |
| // #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0) |
| // #endif |
| |
| Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) { |
| reserve(length + 2); |
| _buf[_size++] = '"'; |
| |
| assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */); |
| |
| const uint8_t* end = v+length; |
| while (v != end) { |
| uint8_t s = kUTF8ByteTable[*v]; |
| switch (s) { |
| case kUTF8ByteVerbatim: |
| _buf[_size++] = *v; |
| break; |
| case kUTF8ByteEncode1: { |
| assert(*v < 16); |
| size_t remainingSize = end-v+1+5; // five additional bytes needed |
| reserve(remainingSize); |
| _buf[_size] = '\\'; |
| _buf[++_size] = 'u'; |
| _buf[++_size] = '0'; |
| _buf[++_size] = '0'; |
| _buf[++_size] = '0'; |
| _buf[++_size] = *v + (*v > 10 ? 55 : 48); // A-F : 0-9 |
| ++_size; |
| assert(_size <= _capacity); |
| break; |
| } |
| case kUTF8ByteEncode2: { |
| // Note: *v is guaranteed to be within the set [16,32),127. This is |
| // an affect of the kUTF8ByteTable lookup table and this code needs to |
| // be revised if the lookup table adds or removes any kUTF8ByteEncode. |
| assert((*v > 15 && *v < 32) || *v == 127); |
| size_t remainingSize = end-v+1+5; // five additional bytes needed |
| reserve(remainingSize); |
| _buf[_size] = '\\'; |
| _buf[++_size] = 'u'; |
| _buf[++_size] = '0'; |
| _buf[++_size] = '0'; |
| uint8_t b1 = (*v & 0xf0) / 16; |
| //uint8_t b1 = (*v & 0xf0) >> 4; // slightly faster but LE-specific |
| uint8_t b2 = *v & 0x0f; |
| _buf[++_size] = b1 + (b1 > 10 ? 55 : 48); // A-F : 0-9 |
| _buf[++_size] = b2 + (b2 > 10 ? 55 : 48); // A-F : 0-9 |
| ++_size; |
| assert(_size <= _capacity); |
| break; |
| } |
| default: |
| // reverse solidus escape |
| size_t remainingSize = end-v+1+1; // one additional byte needed |
| reserve(remainingSize); |
| _buf[_size++] = '\\'; |
| _buf[_size++] = s; |
| assert(_size <= _capacity); |
| break; |
| } |
| |
| ++v; |
| } |
| |
| _buf[_size++] = '"'; |
| assert(_size <= _capacity); |
| return *this; |
| } |
| |
#if JSONT_CXX_RVALUE_REFS
// Move constructor: takes over `other`'s buffer. `other` is left without a
// buffer and with zero size and capacity, so its bookkeeping never describes
// memory it no longer has.
Builder::Builder(Builder&& other)
    : _buf(other._buf)
    , _capacity(other._capacity)
    , _size(other._size)
    , _state(other._state) {
  other._buf = 0;
  other._capacity = 0;
  other._size = 0;
}

// Move assignment: releases the current buffer, then takes over `other`'s.
// Moving an object onto itself is a no-op.
//
// NOTE(review): the buffer is released with free() because the copy
// operations in this file allocate it with malloc() -- confirm this matches
// the destructor.
Builder& Builder::operator=(Builder&& other) {
  if (this != &other) {
    free(_buf);
    _buf = other._buf;
    _capacity = other._capacity;
    _size = other._size;
    _state = other._state;
    other._buf = 0;
    other._capacity = 0;
    other._size = 0;
  }
  return *this;
}
#endif
| |
| Builder::Builder(const Builder& other) |
| : _buf(0) |
| , _capacity(other._capacity) |
| , _size(other._size) |
| , _state(other._state) { |
| _buf = (char*)malloc(_capacity); |
| memcpy((void*)_buf, (const void*)other._buf, _size); |
| } |
| |
| Builder& Builder::operator=(const Builder& other) { |
| _capacity = other._capacity; |
| _size = other._size; |
| _state = other._state; |
| _buf = (char*)malloc(_capacity); |
| memcpy((void*)_buf, (const void*)other._buf, _size); |
| return *this; |
| } |
| |
| } // namespace jsont |