blob: 5863c7a9bcbd7d6ca0e0d58b69f8e9a2ccb734e2 [file] [log] [blame]
Austin Schuhf417eaf2019-09-16 21:58:36 -07001// JSON Tokenizer. Copyright (c) 2012, Rasmus Andersson. All rights reserved.
2// Use of this source code is governed by a MIT-style license that can be
3// found in the LICENSE file.
4#include <stdlib.h>
5#include <stdint.h>
6#include <stdbool.h>
7#include <limits.h>
8#include <ctype.h> // isdigit
9#include <errno.h>
10#include <string.h>
11#include <math.h>
12#include <assert.h>
13
14// Error info
15#ifndef JSONT_ERRINFO_CUSTOM
16#define jsont_err_t const char*
17#define DEF_EM(NAME, msg) static jsont_err_t JSONT_ERRINFO_##NAME = msg
18DEF_EM(STACK_SIZE, "Stack size limit exceeded");
19DEF_EM(UNEXPECTED_OBJECT_END,
20 "Unexpected end of object while not in an object");
21DEF_EM(UNEXPECTED_ARRAY_END, "Unexpected end of array while not in an array");
22DEF_EM(UNEXPECTED_COMMA, "Unexpected \",\"");
23DEF_EM(UNEXPECTED_COLON, "Unexpected \":\"");
24DEF_EM(UNEXPECTED, "Unexpected input");
25DEF_EM(UNEXPECTED_UNICODE_SEQ, "Malformed unicode encoded sequence in string");
26#undef DEF_EM
27#endif
28
29// Size of stack used for structures (in/out array and objects). This value
30// is a balance between memory size of a ctx and how many levels deep the
31// tokenizer can go.
32#define _STRUCT_TYPE_STACK_SIZE 512
33#define _VALUE_BUF_MIN_SIZE 64
34
35static const uint8_t kHexValueTable[55] = {
36 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-0
37 -1, -1, -1, -1, -1, -1, -1,
38 10, 11, 12, 13, 14, 15, // A-F
39 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
40 -1, -1, -1, -1, -1, -1, -1,
41 10, 11, 12, 13, 14, 15 // a-f
42};
43
44typedef uint8_t jsont_tok_t;
45
46typedef struct jsont_ctx {
47 void* user_data;
48 const uint8_t* input_buf;
49 const uint8_t* input_buf_ptr;
50 size_t input_len;
51 const uint8_t* input_buf_value_start;
52 const uint8_t* input_buf_value_end;
53 struct {
54 uint8_t* data;
55 size_t size;
56 size_t length;
57 bool inuse;
58 } value_buf;
59 jsont_err_t error_info;
60 jsont_tok_t curr_tok;
61 size_t st_stack_size;
62 size_t st_stack_len;
63 jsont_tok_t st_stack[_STRUCT_TYPE_STACK_SIZE];
64} jsont_ctx_t;
65
66#define _JSONT_IN_SOURCE
67#include <jsont.h>
68
69unsigned long _hex_str_to_ul(const uint8_t* bytes, size_t len) {
70 unsigned long value = 0;
71 unsigned long cutoff = ULONG_MAX / 16;
72 int cutoff_digit = (int)(ULONG_MAX - cutoff * 16);
73
74 for (size_t i = 0; i != len; ++i) {
75 uint8_t b = bytes[i];
76 int digit = (b > '0'-1 && b < 'f'+1) ? kHexValueTable[b-'0'] : -1;
77 if (b == -1 || // bad digit
78 (value > cutoff) || // overflow
79 ((value == cutoff) && (digit > cutoff_digit)) ) {
80 return ULONG_MAX;
81 } else {
82 value = (value * 16) + digit;
83 }
84 }
85
86 return value;
87}
88
89jsont_ctx_t* jsont_create(void* user_data) {
90 jsont_ctx_t* ctx = (jsont_ctx_t*)calloc(1, sizeof(jsont_ctx_t));
91 ctx->user_data = user_data;
92 ctx->st_stack_size = _STRUCT_TYPE_STACK_SIZE;
93 return ctx;
94}
95
96void jsont_destroy(jsont_ctx_t* ctx) {
97 if (ctx->value_buf.data != 0) {
98 free(ctx->value_buf.data);
99 }
100 free(ctx);
101}
102
103void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
104 ctx->input_buf_ptr = ctx->input_buf = bytes;
105 ctx->input_len = length;
106 ctx->st_stack_len = 0;
107 ctx->curr_tok = JSONT_END;
108 ctx->input_buf_value_start = 0;
109 ctx->input_buf_value_end = 0;
110 ctx->value_buf.length = 0;
111 ctx->value_buf.inuse = false;
112 ctx->error_info = 0;
113}
114
115jsont_tok_t jsont_current(const jsont_ctx_t* ctx) {
116 return ctx->curr_tok;
117}
118
119void* jsont_user_data(const jsont_ctx_t* ctx) {
120 return ctx->user_data;
121}
122
123// Get the current/last byte read. Suitable for debugging JSONT_ERR
124uint8_t jsont_current_byte(jsont_ctx_t* ctx) {
125 return (ctx->input_buf_ptr == 0) ? 0 : *(ctx->input_buf_ptr-1);
126}
127
128size_t jsont_current_offset(jsont_ctx_t* ctx) {
129 return ctx->input_buf_ptr - ctx->input_buf;
130}
131
132jsont_err_t jsont_error_info(jsont_ctx_t* ctx) {
133 return ctx->error_info;
134}
135
136inline static bool _no_value(jsont_ctx_t* ctx) {
137 return ctx->input_buf_value_start == 0
138 || ctx->curr_tok < _JSONT_VALUES_START
139 || ctx->curr_tok > _JSONT_VALUES_END;
140}
141
142inline static size_t _input_avail(jsont_ctx_t* ctx) {
143 return ctx->input_len - (ctx->input_buf_ptr - ctx->input_buf);
144}
145
146inline static uint8_t _next_byte(jsont_ctx_t* ctx) {
147 return (_input_avail(ctx) == 0) ? 0 : *(ctx->input_buf_ptr++);
148}
149
150inline static jsont_tok_t _st_stack_top(const jsont_ctx_t* ctx) {
151 return (ctx->st_stack_len != 0) ? ctx->st_stack[ctx->st_stack_len-1]
152 : JSONT_END;
153}
154
155size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes) {
156 if (_no_value(ctx)) {
157 return 0;
158 } else {
159 if (ctx->value_buf.inuse) {
160 *bytes = ctx->value_buf.data;
161 return ctx->value_buf.length;
162 } else {
163 *bytes = ctx->input_buf_value_start;
164 return ctx->input_buf_value_end - ctx->input_buf_value_start;
165 }
166 }
167}
168
169bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
170 if (ctx->value_buf.inuse) {
171 return (ctx->value_buf.length == length) &&
172 (memcmp((const void*)ctx->value_buf.data,
173 (const void*)bytes, length) == 0);
174 } else {
175 return (ctx->input_buf_value_end - ctx->input_buf_value_start == length) &&
176 (memcmp((const void*)ctx->input_buf_value_start,
177 (const void*)bytes, length) == 0);
178 }
179}
180
181char* jsont_strcpy_value(jsont_ctx_t* ctx) {
182 if (_no_value(ctx)) {
183 return 0;
184 } else {
185 const uint8_t* bytes = 0;
186 size_t len = jsont_data_value(ctx, &bytes);
187 char* buf = (char*)malloc(len+1);
188 if (memcpy((void*)buf, (const void*)bytes, len) != buf) {
189 return 0;
190 }
191 buf[len] = 0;
192 return buf;
193 }
194}
195
196int64_t jsont_int_value(jsont_ctx_t* ctx) {
197 if (_no_value(ctx)) {
198 return INT64_MIN;
199 }
200
201 const uint8_t* start = 0;
202 size_t len = jsont_data_value(ctx, &start);
203 if (len == 0) {
204 return INT64_MIN;
205 }
206 const uint8_t* end = start + len + 1;
207
208 bool negative;
209 uint8_t b = *start++;
210 const int base = 10;
211
212 if (b == '-') {
213 negative = true;
214 b = *start++;
215 if (start == end) {
216 errno = EINVAL;
217 return INT64_MIN;
218 }
219 } else {
220 negative = false;
221 if (b == '+') {
222 b = *start++;
223 if (start == end) {
224 errno = EINVAL;
225 return INT64_MIN;
226 }
227 }
228 }
229
230 uint64_t acc = 0;
231 int any = 0;
232 uint64_t cutoff = negative
233 ? (uint64_t)-(INT64_MIN + INT64_MAX) + INT64_MAX
234 : INT64_MAX;
235 int cutlim = cutoff % base;
236 cutoff /= base;
237 for ( ; start != end; b = *start++) {
238 if (b >= '0' && b <= '9') b -= '0'; else break;
239 if (any < 0 || acc > cutoff || (acc == cutoff && b > cutlim)) {
240 any = -1;
241 } else {
242 any = 1;
243 acc *= base;
244 acc += b;
245 }
246 }
247
248 if (any < 0) {
249 acc = negative ? INT64_MIN : INT64_MAX;
250 errno = ERANGE;
251 } else if (!any) {
252 errno = EINVAL;
253 return INT64_MIN;
254 } else if (negative) {
255 acc = -acc;
256 }
257
258 return (int64_t)acc;
259}
260
261#ifdef NAN
262 #define _JSONT_NAN NAN
263#else
264 #define _JSONT_NAN nan(0)
265#endif
266
267double jsont_float_value(jsont_ctx_t* ctx) {
268 // Note: This might cause a segfault if the input is at the end, so we cause
269 // an error if we try to read a float value while at the end of the input.
270 if (_no_value(ctx) || _input_avail(ctx) == 0) {
271 errno = EINVAL;
272 return _JSONT_NAN;
273 }
274
275 const uint8_t* bytes = 0;
276 size_t len = jsont_data_value(ctx, &bytes);
277 if (len == 0) {
278 return _JSONT_NAN;
279 }
280 return atof((const char*)bytes);
281}
282
283inline static jsont_tok_t _set_tok(jsont_ctx_t* ctx, jsont_tok_t tok) {
284 ctx->curr_tok = tok;
285
286 if (tok != JSONT_END) {
287 if (tok == JSONT_OBJECT_START) {
288 if (ctx->st_stack_len == ctx->st_stack_size) {
289 ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
290 return ctx->curr_tok = JSONT_ERR; // TODO: Grow st_stack
291 }
292 ctx->st_stack[ctx->st_stack_len++] = JSONT_OBJECT_START;
293
294 } else if (tok == JSONT_OBJECT_END) {
295 if (_st_stack_top(ctx) != JSONT_OBJECT_START) {
296 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_OBJECT_END;
297 return ctx->curr_tok = JSONT_ERR;
298 }
299 --ctx->st_stack_len;
300
301 } else if (tok == JSONT_ARRAY_START) {
302 if (ctx->st_stack_len == ctx->st_stack_size) {
303 ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
304 return ctx->curr_tok = JSONT_ERR;
305 }
306 ctx->st_stack[ctx->st_stack_len++] = JSONT_ARRAY_START;
307
308 } else if (tok == JSONT_ARRAY_END) {
309 if (_st_stack_top(ctx) != JSONT_ARRAY_START) {
310 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_ARRAY_END;
311 return ctx->curr_tok = JSONT_ERR;
312 }
313 --ctx->st_stack_len;
314 }
315 }
316
317 return tok;
318}
319inline static void _rewind_one_byte(jsont_ctx_t* ctx) {
320 --ctx->input_buf_ptr;
321}
322inline static void _rewind_bytes(jsont_ctx_t* ctx, size_t n) {
323 ctx->input_buf_ptr -= n;
324}
325inline static void _skip_bytes(jsont_ctx_t* ctx, size_t n) {
326 ctx->input_buf_ptr += n;
327}
328inline static uint8_t _read_atom(jsont_ctx_t* ctx, size_t slacklen,
329 jsont_tok_t tok) {
330 if (_input_avail(ctx) < slacklen) {
331 // rewind and wait for buffer fill
332 _rewind_one_byte(ctx);
333 return _set_tok(ctx, JSONT_END);
334 } else {
335 _skip_bytes(ctx, slacklen); // e.g. "ull" after "n" or "alse" after "f"
336 return _set_tok(ctx, tok);
337 }
338}
339inline static bool _expects_field_name(jsont_ctx_t* ctx) {
340 return ( ctx->curr_tok == JSONT_OBJECT_START
341 || ( ctx->curr_tok == _JSONT_COMMA
342 && _st_stack_top(ctx) == JSONT_OBJECT_START) );
343}
344
345static void _value_buf_append(jsont_ctx_t* ctx, const uint8_t* data, size_t len) {
346 //printf("_value_buf_append(<ctx>, %p, %zu)\n", data, len);
347 if (ctx->value_buf.size == 0) {
348 assert(ctx->value_buf.data == 0);
349 ctx->value_buf.length = len;
350 ctx->value_buf.size = len * 2;
351 if (ctx->value_buf.size < _VALUE_BUF_MIN_SIZE) {
352 ctx->value_buf.size = _VALUE_BUF_MIN_SIZE;
353 }
354 ctx->value_buf.data = (uint8_t*)malloc(ctx->value_buf.size);
355 if (len != 0) {
356 memcpy(ctx->value_buf.data, data, len);
357 }
358 } else {
359 if (ctx->value_buf.length + len > ctx->value_buf.size) {
360 size_t new_size = ctx->value_buf.size + (len * 2);
361 ctx->value_buf.data = realloc(ctx->value_buf.data, new_size);
362 assert(ctx->value_buf.data != 0);
363 ctx->value_buf.size = new_size;
364 }
365 memcpy(ctx->value_buf.data + ctx->value_buf.length, data, len);
366 ctx->value_buf.length += len;
367 }
368 ctx->value_buf.inuse = true;
369}
370
371jsont_tok_t jsont_next(jsont_ctx_t* ctx) {
372 //
373 // { } [ ] n t f "
374 // | | | |
375 // | | | +- /[^"]*/ "
376 // | | +- a l s e
377 // | +- r u e
378 // +- u l l
379 //
380 while (1) {
381 uint8_t b = _next_byte(ctx);
382 switch (b) {
383 case '{': return _set_tok(ctx, JSONT_OBJECT_START);
384 case '}': return _set_tok(ctx, JSONT_OBJECT_END);
385 case '[': return _set_tok(ctx, JSONT_ARRAY_START);
386 case ']': return _set_tok(ctx, JSONT_ARRAY_END);
387 case 'n': return _read_atom(ctx, 3, JSONT_NULL);
388 case 't': return _read_atom(ctx, 3, JSONT_TRUE);
389 case 'f': return _read_atom(ctx, 4, JSONT_FALSE);
390 case '"': {
391 ctx->input_buf_value_start = ctx->input_buf_ptr;
392 ctx->value_buf.inuse = false;
393 ctx->value_buf.length = 0;
394 uint8_t prev_b = 0;
395 while (1) {
396 b = _next_byte(ctx);
397
398 if (b == '\\') {
399 if (prev_b == '\\') {
400 // This is an actual '\'.
401 assert(ctx->value_buf.inuse == true); // should be buffering
402 _value_buf_append(ctx, ctx->input_buf_ptr-1, 1); // append "\"
403 } else {
404 // Okay, this is an escape prefix. Move to buffering value.
405 if (ctx->value_buf.inuse == 0) {
406 _value_buf_append(ctx,
407 ctx->input_buf_value_start,
408 // any data before the "\":
409 (ctx->input_buf_ptr-1 - ctx->input_buf_value_start) );
410 }
411 }
412 } else {
413 // Any byte except '\'
414
415 if (prev_b == '\\') {
416 // Currently just after an escape character
417 assert(ctx->value_buf.inuse == true); // should be buffering
418
419 // JSON specifies a few "magic" characters that have a different
420 // meaning than their value:
421 switch (b) {
422 case 'b':
423 _value_buf_append(ctx, (const uint8_t*)"\b", 1);
424 break;
425 case 'f':
426 _value_buf_append(ctx, (const uint8_t*)"\f", 1);
427 break;
428 case 'n':
429 _value_buf_append(ctx, (const uint8_t*)"\n", 1);
430 break;
431 case 'r':
432 _value_buf_append(ctx, (const uint8_t*)"\r", 1);
433 break;
434 case 't':
435 _value_buf_append(ctx, (const uint8_t*)"\t", 1);
436 break;
437 case 'u': {
438 // 4 hex digits should follow
439 if (_input_avail(ctx) < 4) {
440 _rewind_bytes(ctx,
441 ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
442 return _set_tok(ctx, JSONT_END);
443 }
444 unsigned long utf16cp = _hex_str_to_ul(ctx->input_buf_ptr, 4);
445 ctx->input_buf_ptr += 4;
446 if (utf16cp == ULONG_MAX) {
447 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
448 return _set_tok(ctx, JSONT_ERR);
449 }
450
451 uint32_t cp = (uint16_t)(0xffff & utf16cp);
452
453 // Is lead surrogate?
454 if (cp >= 0xd800u && cp <= 0xdbffu) {
455 // TODO: Implement pairs by reading another "\uHHHH"
456 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
457 return _set_tok(ctx, JSONT_ERR);
458 }
459
460 // Append UTF-8 byte(s) representing the Unicode codepoint `cp`
461 if (cp < 0x80) {
462 uint8_t cp8 = ((uint8_t)cp);
463 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
464 } else if (cp < 0x800) {
465 uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
466 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
467 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
468 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
469 } else {
470 uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
471 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
472 cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
473 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
474 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
475 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
476 }
477
478 break;
479 }
480 default: {
481 _value_buf_append(ctx, &b, 1);
482 break;
483 }
484 } // switch
485
486 } else {
487 // Previous character was NOT an escape character
488
489 if (b == '"') {
490 // Well, this marks the end of a string
491 ctx->input_buf_value_end = ctx->input_buf_ptr-1;
492 return _set_tok(ctx, _expects_field_name(ctx)
493 ? JSONT_FIELD_NAME : JSONT_STRING);
494 break;
495 } else if (b == 0) {
496 // Input buffer ends in the middle of a string
497 _rewind_bytes(ctx,
498 ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
499 return _set_tok(ctx, JSONT_END);
500 } else {
501 if (ctx->value_buf.inuse) {
502 _value_buf_append(ctx, &b, 1);
503 }
504 }
505 }
506 }
507
508 prev_b = b;
509 }
510 }
511 case ',':
512 if ( ctx->curr_tok == JSONT_OBJECT_START
513 || ctx->curr_tok == JSONT_ARRAY_START
514 || ctx->curr_tok == JSONT_END
515 || ctx->curr_tok == JSONT_ERR) {
516 if (ctx->curr_tok != JSONT_ERR)
517 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COMMA;
518 return _set_tok(ctx, JSONT_ERR);
519 }
520 _set_tok(ctx, _JSONT_COMMA);
521 // read next by simply letting the outer "while" do its thing
522 break;
523
524 case ':':
525 if (ctx->curr_tok != JSONT_FIELD_NAME) {
526 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COLON;
527 return _set_tok(ctx, JSONT_ERR);
528 }
529 // let the outer "while" do its thing
530 break;
531
532 case ' ': case '\r': case '\n': case '\t':
533 // ignore whitespace and let the outer "while" do its thing
534 break;
535
536 case 0:
537 //printf("** %d\n", __LINE__);
538 return _set_tok(ctx, JSONT_END);
539
540 default:
541 if (isdigit((int)b) || b == '+' || b == '-') {
542 // We are reading a number
543 ctx->input_buf_value_start = ctx->input_buf_ptr-1;
544 //uint8_t prev_b = 0;
545 bool is_float = false;
546 while (1) {
547 b = _next_byte(ctx);
548 if (b == '.') {
549 is_float = true;
550 } else if (!isdigit((int)b)) {
551 _rewind_one_byte(ctx);
552 ctx->input_buf_value_end = ctx->input_buf_ptr;
553 return _set_tok(ctx, is_float ? JSONT_NUMBER_FLOAT
554 : JSONT_NUMBER_INT);
555 } else if (b == 0) {
556 // Input buffer ends before we know that the number-value ended
557 _rewind_bytes(ctx, ctx->input_buf_ptr
558 - (ctx->input_buf_value_start-1));
559 return _set_tok(ctx, JSONT_END);
560 }
561 }
562 }
563
564 ctx->error_info = JSONT_ERRINFO_UNEXPECTED;
565 return _set_tok(ctx, JSONT_ERR);
566 }
567 } // while (1)
568}
569