blob: 09b1e4567d24869a9e7035e1a6da6bb4de616c15 [file] [log] [blame]
Austin Schuhf417eaf2019-09-16 21:58:36 -07001#include "jsont.hh"
2
3namespace jsont {
4
// Hex digit values indexed by (byte - '0') for bytes in the range ['0', 'f'].
// Entries for bytes in that range that are not hexadecimal digits are -1.
static const int8_t kHexValueTable[55] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,           // '0'-'9'
  -1, -1, -1, -1, -1, -1, -1,             // ':' through '@'
  10, 11, 12, 13, 14, 15,                 // 'A'-'F'
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'G' through '`' (26 entries)
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1,
  10, 11, 12, 13, 14, 15                  // 'a'-'f'
};

// Parses `len` bytes at `bytes` as an unsigned hexadecimal number (no prefix,
// either letter case).
//
// Returns the parsed value, or UINT64_MAX if any byte is not a hexadecimal
// digit or if the value does not fit in 64 bits. An empty input yields 0.
// Note that UINT64_MAX is therefore also what a valid "ffffffffffffffff"
// parses to; callers that need to tell the two apart must limit `len`.
static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
  uint64_t value = 0;
  // Largest value that can still be multiplied by 16 without overflowing, and
  // the largest digit that may be added once that value has been reached.
  const uint64_t cutoff = UINT64_MAX / 16;
  const int cutoff_digit = (int)(UINT64_MAX - cutoff * 16);

  for (size_t i = 0; i != len; ++i) {
    const uint8_t b = bytes[i];
    // Only bytes within ['0', 'f'] are covered by the lookup table.
    const int8_t digit = (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;
    if (digit == -1 ||                                    // bad digit
        (value > cutoff) ||                               // overflow
        ((value == cutoff) && (digit > cutoff_digit)) ) {
      return UINT64_MAX;
    }
    value = (value * 16) + (uint64_t)digit;
  }

  return value;
}
33
34
// _JSONT_NAN: a quiet NaN constant. Uses the NAN macro when the math header
// provides it; otherwise falls back to nan(), which takes a C string tag (an
// empty tag is passed, since a null pointer is not a valid argument).
#ifdef NAN
  #define _JSONT_NAN NAN
#else
  #define _JSONT_NAN nan("")
#endif
40
41
42const char* token_name(jsont::Token tok) {
43 switch (tok) {
44 case End: return "End";
45 case ObjectStart: return "ObjectStart";
46 case ObjectEnd: return "ObjectEnd";
47 case ArrayStart: return "ArrayStart";
48 case ArrayEnd: return "ArrayEnd";
49 case True: return "True";
50 case False: return "False";
51 case Null: return "Null";
52 case Integer: return "Integer";
53 case Float: return "Float";
54 case String: return "String";
55 case FieldName: return "FieldName";
56 default: return "?";
57 }
58}
59
60
61class TokenizerInternal {
62public:
63 inline static const uint8_t* currentInput(const Tokenizer& self) {
64 return self._input.bytes + self._input.offset;
65 }
66
67 inline static const Token& readAtom(Tokenizer& self, const char* str,
68 size_t len, const Token& token) {
69 if (self.availableInput() < len) {
70 return self.setError(Tokenizer::PrematureEndOfInput);
71 } else if (memcmp(currentInput(self), str, len) != 0) {
72 return self.setError(Tokenizer::InvalidByte);
73 } else {
74 self._input.offset += len;
75 return self.setToken(token);
76 }
77 }
78};
79
80
81Tokenizer::~Tokenizer() {}
82
83
84void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) {
85 assert(encoding == UTF8TextEncoding); // only supported encoding
86 _input.bytes = (const uint8_t*)bytes;
87 _input.length = length;
88 _input.offset = 0;
89 _error.code = UnspecifiedError;
90 // Advance to first token
91 next();
92}
93
94
95const char* Tokenizer::errorMessage() const {
96 switch (_error.code) {
97 case UnexpectedComma:
98 return "Unexpected comma";
99 case UnexpectedTrailingComma:
100 return "Unexpected trailing comma";
101 case InvalidByte:
102 return "Invalid input byte";
103 case PrematureEndOfInput:
104 return "Premature end of input";
105 case MalformedUnicodeEscapeSequence:
106 return "Malformed Unicode escape sequence";
107 case MalformedNumberLiteral:
108 return "Malformed number literal";
109 case UnterminatedString:
110 return "Unterminated string";
111 case SyntaxError:
112 return "Illegal JSON (syntax error)";
113 default:
114 return "Unspecified error";
115 }
116}
117
118
119size_t Tokenizer::dataValue(const char const** bytes) const {
120 if (!hasValue()) { return 0; }
121 if (_value.buffered) {
122 *bytes = (const char const*)_value.buffer.data();
123 return _value.buffer.size();
124 } else {
125 *bytes = (const char const*)(_input.bytes + _value.offset);
126 return _value.length;
127 }
128}
129
130
131double Tokenizer::floatValue() const {
132 if (!hasValue()) {
133 return _token == jsont::True ? 1.0 : 0.0;
134 }
135
136 const char* bytes;
137
138 if (_value.buffered) {
139 // edge-case since only happens with string values using escape sequences
140 bytes = _value.buffer.c_str();
141 } else {
142 bytes = (const char*)_input.bytes + _value.offset;
143 if (availableInput() == 0) {
144 // In this case where the data lies at the edge of the buffer, we can't pass
145 // it directly to atof, since there will be no sentinel byte. We are fine
146 // with a copy, since this is an edge case (only happens either for broken
147 // JSON or when the whole document is just a number).
148 char* buf[128];
149 if (_value.length > 127) {
150 // We are unable to interpret such a large literal in this edge-case
151 return _JSONT_NAN;
152 }
153 memcpy((void*)buf, (const void*)bytes, _value.length);
154 buf[_value.length] = '\0';
155 return strtod((const char*)buf, (char**)0);
156 }
157 }
158
159 return strtod(bytes, (char**)0);
160}
161
162
163int64_t Tokenizer::intValue() const {
164 if (!hasValue()) {
165 return _token == jsont::True ? 1LL : 0LL;
166 }
167
168 const char* bytes;
169
170 if (_value.buffered) {
171 // edge-case since only happens with string values using escape sequences
172 bytes = _value.buffer.c_str();
173 } else {
174 bytes = (const char*)_input.bytes + _value.offset;
175 if (availableInput() == 0) {
176 // In this case where the data lies at the edge of the buffer, we can't pass
177 // it directly to atof, since there will be no sentinel byte. We are fine
178 // with a copy, since this is an edge case (only happens either for broken
179 // JSON or when the whole document is just a number).
180 char* buf[21];
181 if (_value.length > 20) {
182 // We are unable to interpret such a large literal in this edge-case
183 return 0;
184 }
185 memcpy((void*)buf, (const void*)bytes, _value.length);
186 buf[_value.length] = '\0';
187 return strtoll((const char*)buf, (char**)0, 10);
188 }
189 }
190
191 return strtoll(bytes, (char**)0, 10);
192}
193
194
195const Token& Tokenizer::next() {
196 //
197 // { } [ ] n t f "
198 // | | | |
199 // | | | +- /[^"]*/ "
200 // | | +- a l s e
201 // | +- r u e
202 // +- u l l
203 //
204 while (!endOfInput()) {
205 uint8_t b = _input.bytes[_input.offset++];
206 switch (b) {
207 case '{': return setToken(ObjectStart);
208 case '}': {
209 if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
210 return setToken(ObjectEnd);
211 }
212
213 case '[': return setToken(ArrayStart);
214 case ']': {
215 if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
216 return setToken(ArrayEnd);
217 }
218
219 case 'n':
220 return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null);
221 case 't':
222 return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True);
223 case 'f':
224 return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False);
225
226 case ' ': case '\t': case '\r': case '\n': // IETF RFC4627
227 // ignore whitespace and let the outer "while" do its thing
228 break;
229
230 case 0:
231 return setError(InvalidByte);
232
233 // when we read a value, we don't produce a token until we either reach
234 // end of input, a colon (then the value is a field name), a comma, or an
235 // array or object terminator.
236
237 case '"': {
238 _value.beginAtOffset(_input.offset);
239
240 while (!endOfInput()) {
241 b = _input.bytes[_input.offset++];
242 assert(_input.offset < _input.length);
243
244 switch (b) {
245
246 case '\\': {
247 // We must go buffered since the input segment != value
248 if (!_value.buffered) {
249 _value.buffered = true;
250 _value.buffer.assign(
251 (const char*)(_input.bytes+_value.offset),
252 _input.offset - _value.offset - 1
253 );
254 }
255
256 if (endOfInput()) {
257 return setError(PrematureEndOfInput);
258 }
259
260 b = _input.bytes[_input.offset++];
261 switch (b) {
262 case 'b': _value.buffer.append(1, '\x08'); break;
263 case 'f': _value.buffer.append(1, '\x0C'); break;
264 case 'n': _value.buffer.append(1, '\x0A'); break;
265 case 'r': _value.buffer.append(1, '\x0D'); break;
266 case 't': _value.buffer.append(1, '\x09'); break;
267 case 'u': {
268 // \uxxxx
269 if (availableInput() < 4) {
270 return setError(PrematureEndOfInput);
271 }
272
273 uint64_t utf16cp =
274 _xtou64(TokenizerInternal::currentInput(*this), 4);
275 _input.offset += 4;
276
277 if (utf16cp > 0xffff) {
278 return setError(MalformedUnicodeEscapeSequence);
279 }
280
281 uint16_t cp = (uint16_t)(0xffff & utf16cp);
282
283 // Append UTF-8 byte(s) representing the Unicode codepoint cp
284 if (cp < 0x80) {
285 // U+0000 - U+007F
286 uint8_t cp8 = ((uint8_t)cp);
287 _value.buffer.append(1, (char)cp8);
288 } else if (cp < 0x800) {
289 // U+0080 - U+07FF
290 uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
291 _value.buffer.append(1, (char)cp8);
292 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
293 _value.buffer.append(1, (char)cp8);
294 } else if (cp >= 0xD800u && cp <= 0xDFFFu) {
295 // UTF-16 Surrogate pairs -- according to the UTF-8
296 // definition (RFC 3629) the high and low surrogate halves
297 // used by UTF-16 (U+D800 through U+DFFF) are not legal
298 // Unicode values, and the UTF-8 encoding of them is an
299 // invalid byte sequence. Instead of throwing an error, we
300 // substitute this character with the replacement character
301 // U+FFFD (UTF-8: EF,BF,BD).
302 _value.buffer.append("\xEF\xBF\xBD");
303 //
304 } else {
305 // U+0800 - U+FFFF
306 uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
307 _value.buffer.append(1, (char)cp8);
308 cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
309 _value.buffer.append(1, (char)cp8);
310 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
311 _value.buffer.append(1, (char)cp8);
312 }
313
314 break;
315 }
316 default:
317 _value.buffer.append(1, (char)b); break;
318 }
319 break;
320 }
321
322 case '"':
323 goto after_initial_read_b;
324
325 case 0:
326 return setError(InvalidByte);
327
328 default: {
329 if (_value.buffered) {
330 // TODO: Make this efficient by appending chunks between
331 // boundaries instead of appending per-byte
332 _value.buffer.append(1, (char)b);
333 }
334 break;
335 }
336 } // switch(b)
337 } // while (!endOfInput())
338
339 after_initial_read_b:
340 if (b != '"') {
341 return setError(UnterminatedString);
342 }
343
344 if (!_value.buffered) {
345 _value.length = _input.offset - _value.offset - 1;
346 }
347
348 // is this a field name?
349 while (!endOfInput()) {
350 b = _input.bytes[_input.offset++];
351 switch (b) {
352 case ' ': case '\t': case '\r': case '\n': break;
353 case ':': return setToken(FieldName);
354 case ',': goto string_read_return_string;
355 case ']': case '}': {
356 --_input.offset; // rewind
357 goto string_read_return_string;
358 }
359 case 0: return setError(InvalidByte);
360 default: {
361 // Expected a comma or a colon
362 return setError(SyntaxError);
363 }
364 }
365 }
366
367 string_read_return_string:
368 return setToken(jsont::String);
369 }
370
371 case ',': {
372 if (_token == ObjectStart || _token == ArrayStart || _token == _Comma) {
373 return setError(UnexpectedComma);
374 }
375 _token = _Comma;
376 break;
377 }
378
379 default: {
380 if (isdigit((int)b) || b == '+' || b == '-') {
381 // We are reading a number
382 _value.beginAtOffset(_input.offset-1);
383 Token token = jsont::Integer;
384
385 while (!endOfInput()) {
386 b = _input.bytes[_input.offset++];
387 switch (b) {
388 case '0'...'9': break;
389 case '.': token = jsont::Float; break;
390 case 'E': case 'e': case '-': case '+': {
391 if (token != jsont::Float) {
392 return setError(MalformedNumberLiteral);
393 }
394 break;
395 }
396 default: {
397 if ( (_input.offset - _value.offset == 1) &&
398 (_input.bytes[_value.offset] == '-' ||
399 _input.bytes[_value.offset] == '+') ) {
400 return setError(MalformedNumberLiteral);
401 }
402
403 // rewind the byte that terminated this number literal
404 --_input.offset;
405
406 _value.length = _input.offset - _value.offset - 1;
407 return setToken(token);
408 }
409 }
410 }
411 return setToken(End);
412 } else {
413 return setError(InvalidByte);
414 }
415 }
416 }
417 }
418
419 return setToken(End);
420}
421
422
// How Builder::appendString must emit a given input byte. Any table entry
// other than these three values is the character to write after a backslash.
enum {
  kUTF8ByteVerbatim = 0,
  kUTF8ByteEncode1, // "\u000x"
  kUTF8ByteEncode2, // "\u00xx"
};

// Short aliases, only defined while the table is being spelled out.
#define V kUTF8ByteVerbatim
#define E1 kUTF8ByteEncode1
#define E2 kUTF8ByteEncode2

// Indexed by input byte value; one row per 16 bytes.
static const uint8_t kUTF8ByteTable[256] = {
  /* 0x00 */ E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1,
  /* 0x10 */ E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2,
  /* 0x20 */ V, V, '"', V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x30 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x40 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x50 */ V, V, V, V, V, V, V, V, V, V, V, V, '\\', V, V, V,
  /* 0x60 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x70 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, E2,
  /* 0x80 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0x90 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xA0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xB0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xC0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xD0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xE0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
  /* 0xF0 */ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V
};

#undef V
#undef E1
#undef E2
447
448// #ifndef __has_feature
449// #define __has_feature(x) 0
450// #endif
451// #if defined(__cplusplus) && __has_feature(cxx_static_assert)
452// #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg))
453// #elif __has_feature(c_static_assert)
454// #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg))
455// #else
456// #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0)
457// #endif
458
459Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) {
460 reserve(length + 2);
461 _buf[_size++] = '"';
462
463 assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */);
464
465 const uint8_t* end = v+length;
466 while (v != end) {
467 uint8_t s = kUTF8ByteTable[*v];
468 switch (s) {
469 case kUTF8ByteVerbatim:
470 _buf[_size++] = *v;
471 break;
472 case kUTF8ByteEncode1: {
473 assert(*v < 16);
474 size_t remainingSize = end-v+1+5; // five additional bytes needed
475 reserve(remainingSize);
476 _buf[_size] = '\\';
477 _buf[++_size] = 'u';
478 _buf[++_size] = '0';
479 _buf[++_size] = '0';
480 _buf[++_size] = '0';
481 _buf[++_size] = *v + (*v > 10 ? 55 : 48); // A-F : 0-9
482 ++_size;
483 assert(_size <= _capacity);
484 break;
485 }
486 case kUTF8ByteEncode2: {
487 // Note: *v is guaranteed to be within the set [16,32),127. This is
488 // an affect of the kUTF8ByteTable lookup table and this code needs to
489 // be revised if the lookup table adds or removes any kUTF8ByteEncode.
490 assert((*v > 15 && *v < 32) || *v == 127);
491 size_t remainingSize = end-v+1+5; // five additional bytes needed
492 reserve(remainingSize);
493 _buf[_size] = '\\';
494 _buf[++_size] = 'u';
495 _buf[++_size] = '0';
496 _buf[++_size] = '0';
497 uint8_t b1 = (*v & 0xf0) / 16;
498 //uint8_t b1 = (*v & 0xf0) >> 4; // slightly faster but LE-specific
499 uint8_t b2 = *v & 0x0f;
500 _buf[++_size] = b1 + (b1 > 10 ? 55 : 48); // A-F : 0-9
501 _buf[++_size] = b2 + (b2 > 10 ? 55 : 48); // A-F : 0-9
502 ++_size;
503 assert(_size <= _capacity);
504 break;
505 }
506 default:
507 // reverse solidus escape
508 size_t remainingSize = end-v+1+1; // one additional byte needed
509 reserve(remainingSize);
510 _buf[_size++] = '\\';
511 _buf[_size++] = s;
512 assert(_size <= _capacity);
513 break;
514 }
515
516 ++v;
517 }
518
519 _buf[_size++] = '"';
520 assert(_size <= _capacity);
521 return *this;
522}
523
524#if JSONT_CXX_RVALUE_REFS
525 // Move constructor and assignment operator
526 Builder::Builder(Builder&& other)
527 : _buf(other._buf)
528 , _capacity(other._capacity)
529 , _size(other._size)
530 , _state(other._state) {
531 other._buf = 0;
532 }
533
534 Builder& Builder::operator=(Builder&& other) {
535 _buf = other._buf; other._buf = 0;
536 _capacity = other._capacity;
537 _size = other._size;
538 _state = other._state;
539 return *this;
540 }
541#endif
542
543Builder::Builder(const Builder& other)
544 : _buf(0)
545 , _capacity(other._capacity)
546 , _size(other._size)
547 , _state(other._state) {
548 _buf = (char*)malloc(_capacity);
549 memcpy((void*)_buf, (const void*)other._buf, _size);
550}
551
552Builder& Builder::operator=(const Builder& other) {
553 _capacity = other._capacity;
554 _size = other._size;
555 _state = other._state;
556 _buf = (char*)malloc(_capacity);
557 memcpy((void*)_buf, (const void*)other._buf, _size);
558 return *this;
559}
560
561} // namespace jsont