blob: b318494327b31804ab2b5ae5f462de7232e80485 [file] [log] [blame]
Austin Schuhf417eaf2019-09-16 21:58:36 -07001// JSON Tokenizer. Copyright (c) 2012, Rasmus Andersson. All rights reserved.
2// Use of this source code is governed by a MIT-style license that can be
3// found in the LICENSE file.
4#include <stdlib.h>
5#include <stdint.h>
6#include <stdbool.h>
7#include <limits.h>
8#include <ctype.h> // isdigit
9#include <errno.h>
10#include <string.h>
11#include <math.h>
12#include <assert.h>
13
14// Error info
15#ifndef JSONT_ERRINFO_CUSTOM
16#define jsont_err_t const char*
17#define DEF_EM(NAME, msg) static jsont_err_t JSONT_ERRINFO_##NAME = msg
18DEF_EM(STACK_SIZE, "Stack size limit exceeded");
19DEF_EM(UNEXPECTED_OBJECT_END,
20 "Unexpected end of object while not in an object");
21DEF_EM(UNEXPECTED_ARRAY_END, "Unexpected end of array while not in an array");
22DEF_EM(UNEXPECTED_COMMA, "Unexpected \",\"");
23DEF_EM(UNEXPECTED_COLON, "Unexpected \":\"");
24DEF_EM(UNEXPECTED, "Unexpected input");
25DEF_EM(UNEXPECTED_UNICODE_SEQ, "Malformed unicode encoded sequence in string");
26#undef DEF_EM
27#endif
28
29// Size of stack used for structures (in/out array and objects). This value
30// is a balance between memory size of a ctx and how many levels deep the
31// tokenizer can go.
32#define _STRUCT_TYPE_STACK_SIZE 512
33#define _VALUE_BUF_MIN_SIZE 64
34
35static const uint8_t kHexValueTable[55] = {
36 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-0
37 -1, -1, -1, -1, -1, -1, -1,
38 10, 11, 12, 13, 14, 15, // A-F
39 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
40 -1, -1, -1, -1, -1, -1, -1,
41 10, 11, 12, 13, 14, 15 // a-f
42};
43
44typedef uint8_t jsont_tok_t;
45
46typedef struct jsont_ctx {
47 void* user_data;
48 const uint8_t* input_buf;
49 const uint8_t* input_buf_ptr;
50 size_t input_len;
51 const uint8_t* input_buf_value_start;
52 const uint8_t* input_buf_value_end;
53 struct {
54 uint8_t* data;
55 size_t size;
56 size_t length;
57 bool inuse;
58 } value_buf;
59 jsont_err_t error_info;
60 jsont_tok_t curr_tok;
61 size_t st_stack_size;
62 size_t st_stack_len;
63 jsont_tok_t st_stack[_STRUCT_TYPE_STACK_SIZE];
64} jsont_ctx_t;
65
66#define _JSONT_IN_SOURCE
67#include <jsont.h>
68
69unsigned long _hex_str_to_ul(const uint8_t* bytes, size_t len) {
70 unsigned long value = 0;
71 unsigned long cutoff = ULONG_MAX / 16;
72 int cutoff_digit = (int)(ULONG_MAX - cutoff * 16);
73
74 for (size_t i = 0; i != len; ++i) {
75 uint8_t b = bytes[i];
76 int digit = (b > '0'-1 && b < 'f'+1) ? kHexValueTable[b-'0'] : -1;
Austin Schuh3e95e5d2019-09-20 00:08:54 -070077 if (b == 0xff || // bad digit
Austin Schuhf417eaf2019-09-16 21:58:36 -070078 (value > cutoff) || // overflow
79 ((value == cutoff) && (digit > cutoff_digit)) ) {
80 return ULONG_MAX;
81 } else {
82 value = (value * 16) + digit;
83 }
84 }
85
86 return value;
87}
88
89jsont_ctx_t* jsont_create(void* user_data) {
90 jsont_ctx_t* ctx = (jsont_ctx_t*)calloc(1, sizeof(jsont_ctx_t));
91 ctx->user_data = user_data;
92 ctx->st_stack_size = _STRUCT_TYPE_STACK_SIZE;
93 return ctx;
94}
95
96void jsont_destroy(jsont_ctx_t* ctx) {
97 if (ctx->value_buf.data != 0) {
98 free(ctx->value_buf.data);
99 }
100 free(ctx);
101}
102
103void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
104 ctx->input_buf_ptr = ctx->input_buf = bytes;
105 ctx->input_len = length;
106 ctx->st_stack_len = 0;
107 ctx->curr_tok = JSONT_END;
108 ctx->input_buf_value_start = 0;
109 ctx->input_buf_value_end = 0;
110 ctx->value_buf.length = 0;
111 ctx->value_buf.inuse = false;
112 ctx->error_info = 0;
113}
114
115jsont_tok_t jsont_current(const jsont_ctx_t* ctx) {
116 return ctx->curr_tok;
117}
118
119void* jsont_user_data(const jsont_ctx_t* ctx) {
120 return ctx->user_data;
121}
122
123// Get the current/last byte read. Suitable for debugging JSONT_ERR
124uint8_t jsont_current_byte(jsont_ctx_t* ctx) {
125 return (ctx->input_buf_ptr == 0) ? 0 : *(ctx->input_buf_ptr-1);
126}
127
128size_t jsont_current_offset(jsont_ctx_t* ctx) {
129 return ctx->input_buf_ptr - ctx->input_buf;
130}
131
132jsont_err_t jsont_error_info(jsont_ctx_t* ctx) {
133 return ctx->error_info;
134}
135
136inline static bool _no_value(jsont_ctx_t* ctx) {
137 return ctx->input_buf_value_start == 0
138 || ctx->curr_tok < _JSONT_VALUES_START
139 || ctx->curr_tok > _JSONT_VALUES_END;
140}
141
142inline static size_t _input_avail(jsont_ctx_t* ctx) {
143 return ctx->input_len - (ctx->input_buf_ptr - ctx->input_buf);
144}
145
146inline static uint8_t _next_byte(jsont_ctx_t* ctx) {
147 return (_input_avail(ctx) == 0) ? 0 : *(ctx->input_buf_ptr++);
148}
149
150inline static jsont_tok_t _st_stack_top(const jsont_ctx_t* ctx) {
151 return (ctx->st_stack_len != 0) ? ctx->st_stack[ctx->st_stack_len-1]
152 : JSONT_END;
153}
154
155size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes) {
156 if (_no_value(ctx)) {
157 return 0;
158 } else {
159 if (ctx->value_buf.inuse) {
160 *bytes = ctx->value_buf.data;
161 return ctx->value_buf.length;
162 } else {
163 *bytes = ctx->input_buf_value_start;
164 return ctx->input_buf_value_end - ctx->input_buf_value_start;
165 }
166 }
167}
168
169bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
170 if (ctx->value_buf.inuse) {
171 return (ctx->value_buf.length == length) &&
172 (memcmp((const void*)ctx->value_buf.data,
173 (const void*)bytes, length) == 0);
174 } else {
Austin Schuh3e95e5d2019-09-20 00:08:54 -0700175 return (ctx->input_buf_value_end - ctx->input_buf_value_start ==
176 (ssize_t)length) &&
177 (memcmp((const void *)ctx->input_buf_value_start,
178 (const void *)bytes, length) == 0);
Austin Schuhf417eaf2019-09-16 21:58:36 -0700179 }
180}
181
182char* jsont_strcpy_value(jsont_ctx_t* ctx) {
183 if (_no_value(ctx)) {
184 return 0;
185 } else {
186 const uint8_t* bytes = 0;
187 size_t len = jsont_data_value(ctx, &bytes);
188 char* buf = (char*)malloc(len+1);
189 if (memcpy((void*)buf, (const void*)bytes, len) != buf) {
190 return 0;
191 }
192 buf[len] = 0;
193 return buf;
194 }
195}
196
197int64_t jsont_int_value(jsont_ctx_t* ctx) {
198 if (_no_value(ctx)) {
199 return INT64_MIN;
200 }
201
202 const uint8_t* start = 0;
203 size_t len = jsont_data_value(ctx, &start);
204 if (len == 0) {
205 return INT64_MIN;
206 }
207 const uint8_t* end = start + len + 1;
208
209 bool negative;
210 uint8_t b = *start++;
211 const int base = 10;
212
213 if (b == '-') {
214 negative = true;
215 b = *start++;
216 if (start == end) {
217 errno = EINVAL;
218 return INT64_MIN;
219 }
220 } else {
221 negative = false;
222 if (b == '+') {
223 b = *start++;
224 if (start == end) {
225 errno = EINVAL;
226 return INT64_MIN;
227 }
228 }
229 }
230
231 uint64_t acc = 0;
232 int any = 0;
233 uint64_t cutoff = negative
234 ? (uint64_t)-(INT64_MIN + INT64_MAX) + INT64_MAX
235 : INT64_MAX;
236 int cutlim = cutoff % base;
237 cutoff /= base;
238 for ( ; start != end; b = *start++) {
239 if (b >= '0' && b <= '9') b -= '0'; else break;
240 if (any < 0 || acc > cutoff || (acc == cutoff && b > cutlim)) {
241 any = -1;
242 } else {
243 any = 1;
244 acc *= base;
245 acc += b;
246 }
247 }
248
249 if (any < 0) {
250 acc = negative ? INT64_MIN : INT64_MAX;
251 errno = ERANGE;
252 } else if (!any) {
253 errno = EINVAL;
254 return INT64_MIN;
255 } else if (negative) {
256 acc = -acc;
257 }
258
259 return (int64_t)acc;
260}
261
262#ifdef NAN
263 #define _JSONT_NAN NAN
264#else
265 #define _JSONT_NAN nan(0)
266#endif
267
268double jsont_float_value(jsont_ctx_t* ctx) {
269 // Note: This might cause a segfault if the input is at the end, so we cause
270 // an error if we try to read a float value while at the end of the input.
271 if (_no_value(ctx) || _input_avail(ctx) == 0) {
272 errno = EINVAL;
273 return _JSONT_NAN;
274 }
275
276 const uint8_t* bytes = 0;
277 size_t len = jsont_data_value(ctx, &bytes);
278 if (len == 0) {
279 return _JSONT_NAN;
280 }
281 return atof((const char*)bytes);
282}
283
284inline static jsont_tok_t _set_tok(jsont_ctx_t* ctx, jsont_tok_t tok) {
285 ctx->curr_tok = tok;
286
287 if (tok != JSONT_END) {
288 if (tok == JSONT_OBJECT_START) {
289 if (ctx->st_stack_len == ctx->st_stack_size) {
290 ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
291 return ctx->curr_tok = JSONT_ERR; // TODO: Grow st_stack
292 }
293 ctx->st_stack[ctx->st_stack_len++] = JSONT_OBJECT_START;
294
295 } else if (tok == JSONT_OBJECT_END) {
296 if (_st_stack_top(ctx) != JSONT_OBJECT_START) {
297 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_OBJECT_END;
298 return ctx->curr_tok = JSONT_ERR;
299 }
300 --ctx->st_stack_len;
301
302 } else if (tok == JSONT_ARRAY_START) {
303 if (ctx->st_stack_len == ctx->st_stack_size) {
304 ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
305 return ctx->curr_tok = JSONT_ERR;
306 }
307 ctx->st_stack[ctx->st_stack_len++] = JSONT_ARRAY_START;
308
309 } else if (tok == JSONT_ARRAY_END) {
310 if (_st_stack_top(ctx) != JSONT_ARRAY_START) {
311 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_ARRAY_END;
312 return ctx->curr_tok = JSONT_ERR;
313 }
314 --ctx->st_stack_len;
315 }
316 }
317
318 return tok;
319}
320inline static void _rewind_one_byte(jsont_ctx_t* ctx) {
321 --ctx->input_buf_ptr;
322}
323inline static void _rewind_bytes(jsont_ctx_t* ctx, size_t n) {
324 ctx->input_buf_ptr -= n;
325}
326inline static void _skip_bytes(jsont_ctx_t* ctx, size_t n) {
327 ctx->input_buf_ptr += n;
328}
329inline static uint8_t _read_atom(jsont_ctx_t* ctx, size_t slacklen,
330 jsont_tok_t tok) {
331 if (_input_avail(ctx) < slacklen) {
332 // rewind and wait for buffer fill
333 _rewind_one_byte(ctx);
334 return _set_tok(ctx, JSONT_END);
335 } else {
336 _skip_bytes(ctx, slacklen); // e.g. "ull" after "n" or "alse" after "f"
337 return _set_tok(ctx, tok);
338 }
339}
340inline static bool _expects_field_name(jsont_ctx_t* ctx) {
341 return ( ctx->curr_tok == JSONT_OBJECT_START
342 || ( ctx->curr_tok == _JSONT_COMMA
343 && _st_stack_top(ctx) == JSONT_OBJECT_START) );
344}
345
346static void _value_buf_append(jsont_ctx_t* ctx, const uint8_t* data, size_t len) {
347 //printf("_value_buf_append(<ctx>, %p, %zu)\n", data, len);
348 if (ctx->value_buf.size == 0) {
349 assert(ctx->value_buf.data == 0);
350 ctx->value_buf.length = len;
351 ctx->value_buf.size = len * 2;
352 if (ctx->value_buf.size < _VALUE_BUF_MIN_SIZE) {
353 ctx->value_buf.size = _VALUE_BUF_MIN_SIZE;
354 }
355 ctx->value_buf.data = (uint8_t*)malloc(ctx->value_buf.size);
356 if (len != 0) {
357 memcpy(ctx->value_buf.data, data, len);
358 }
359 } else {
360 if (ctx->value_buf.length + len > ctx->value_buf.size) {
361 size_t new_size = ctx->value_buf.size + (len * 2);
362 ctx->value_buf.data = realloc(ctx->value_buf.data, new_size);
363 assert(ctx->value_buf.data != 0);
364 ctx->value_buf.size = new_size;
365 }
366 memcpy(ctx->value_buf.data + ctx->value_buf.length, data, len);
367 ctx->value_buf.length += len;
368 }
369 ctx->value_buf.inuse = true;
370}
371
372jsont_tok_t jsont_next(jsont_ctx_t* ctx) {
373 //
374 // { } [ ] n t f "
375 // | | | |
376 // | | | +- /[^"]*/ "
377 // | | +- a l s e
378 // | +- r u e
379 // +- u l l
380 //
381 while (1) {
382 uint8_t b = _next_byte(ctx);
383 switch (b) {
384 case '{': return _set_tok(ctx, JSONT_OBJECT_START);
385 case '}': return _set_tok(ctx, JSONT_OBJECT_END);
386 case '[': return _set_tok(ctx, JSONT_ARRAY_START);
387 case ']': return _set_tok(ctx, JSONT_ARRAY_END);
388 case 'n': return _read_atom(ctx, 3, JSONT_NULL);
389 case 't': return _read_atom(ctx, 3, JSONT_TRUE);
390 case 'f': return _read_atom(ctx, 4, JSONT_FALSE);
391 case '"': {
392 ctx->input_buf_value_start = ctx->input_buf_ptr;
393 ctx->value_buf.inuse = false;
394 ctx->value_buf.length = 0;
395 uint8_t prev_b = 0;
396 while (1) {
397 b = _next_byte(ctx);
398
399 if (b == '\\') {
400 if (prev_b == '\\') {
401 // This is an actual '\'.
402 assert(ctx->value_buf.inuse == true); // should be buffering
403 _value_buf_append(ctx, ctx->input_buf_ptr-1, 1); // append "\"
404 } else {
405 // Okay, this is an escape prefix. Move to buffering value.
406 if (ctx->value_buf.inuse == 0) {
407 _value_buf_append(ctx,
408 ctx->input_buf_value_start,
409 // any data before the "\":
410 (ctx->input_buf_ptr-1 - ctx->input_buf_value_start) );
411 }
412 }
413 } else {
414 // Any byte except '\'
415
416 if (prev_b == '\\') {
417 // Currently just after an escape character
418 assert(ctx->value_buf.inuse == true); // should be buffering
419
420 // JSON specifies a few "magic" characters that have a different
421 // meaning than their value:
422 switch (b) {
423 case 'b':
424 _value_buf_append(ctx, (const uint8_t*)"\b", 1);
425 break;
426 case 'f':
427 _value_buf_append(ctx, (const uint8_t*)"\f", 1);
428 break;
429 case 'n':
430 _value_buf_append(ctx, (const uint8_t*)"\n", 1);
431 break;
432 case 'r':
433 _value_buf_append(ctx, (const uint8_t*)"\r", 1);
434 break;
435 case 't':
436 _value_buf_append(ctx, (const uint8_t*)"\t", 1);
437 break;
438 case 'u': {
439 // 4 hex digits should follow
440 if (_input_avail(ctx) < 4) {
441 _rewind_bytes(ctx,
442 ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
443 return _set_tok(ctx, JSONT_END);
444 }
445 unsigned long utf16cp = _hex_str_to_ul(ctx->input_buf_ptr, 4);
446 ctx->input_buf_ptr += 4;
447 if (utf16cp == ULONG_MAX) {
448 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
449 return _set_tok(ctx, JSONT_ERR);
450 }
451
452 uint32_t cp = (uint16_t)(0xffff & utf16cp);
453
454 // Is lead surrogate?
455 if (cp >= 0xd800u && cp <= 0xdbffu) {
456 // TODO: Implement pairs by reading another "\uHHHH"
457 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
458 return _set_tok(ctx, JSONT_ERR);
459 }
460
461 // Append UTF-8 byte(s) representing the Unicode codepoint `cp`
462 if (cp < 0x80) {
463 uint8_t cp8 = ((uint8_t)cp);
464 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
465 } else if (cp < 0x800) {
466 uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
467 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
468 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
469 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
470 } else {
471 uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
472 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
473 cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
474 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
475 cp8 = (uint8_t)((cp & 0x3f) | 0x80);
476 _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
477 }
478
479 break;
480 }
481 default: {
482 _value_buf_append(ctx, &b, 1);
483 break;
484 }
485 } // switch
486
487 } else {
488 // Previous character was NOT an escape character
489
490 if (b == '"') {
491 // Well, this marks the end of a string
492 ctx->input_buf_value_end = ctx->input_buf_ptr-1;
493 return _set_tok(ctx, _expects_field_name(ctx)
494 ? JSONT_FIELD_NAME : JSONT_STRING);
495 break;
496 } else if (b == 0) {
497 // Input buffer ends in the middle of a string
498 _rewind_bytes(ctx,
499 ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
500 return _set_tok(ctx, JSONT_END);
501 } else {
502 if (ctx->value_buf.inuse) {
503 _value_buf_append(ctx, &b, 1);
504 }
505 }
506 }
507 }
508
509 prev_b = b;
510 }
511 }
512 case ',':
513 if ( ctx->curr_tok == JSONT_OBJECT_START
514 || ctx->curr_tok == JSONT_ARRAY_START
515 || ctx->curr_tok == JSONT_END
516 || ctx->curr_tok == JSONT_ERR) {
517 if (ctx->curr_tok != JSONT_ERR)
518 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COMMA;
519 return _set_tok(ctx, JSONT_ERR);
520 }
521 _set_tok(ctx, _JSONT_COMMA);
522 // read next by simply letting the outer "while" do its thing
523 break;
524
525 case ':':
526 if (ctx->curr_tok != JSONT_FIELD_NAME) {
527 ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COLON;
528 return _set_tok(ctx, JSONT_ERR);
529 }
530 // let the outer "while" do its thing
531 break;
532
533 case ' ': case '\r': case '\n': case '\t':
534 // ignore whitespace and let the outer "while" do its thing
535 break;
536
537 case 0:
538 //printf("** %d\n", __LINE__);
539 return _set_tok(ctx, JSONT_END);
540
541 default:
542 if (isdigit((int)b) || b == '+' || b == '-') {
543 // We are reading a number
544 ctx->input_buf_value_start = ctx->input_buf_ptr-1;
545 //uint8_t prev_b = 0;
546 bool is_float = false;
547 while (1) {
548 b = _next_byte(ctx);
549 if (b == '.') {
550 is_float = true;
551 } else if (!isdigit((int)b)) {
552 _rewind_one_byte(ctx);
553 ctx->input_buf_value_end = ctx->input_buf_ptr;
554 return _set_tok(ctx, is_float ? JSONT_NUMBER_FLOAT
555 : JSONT_NUMBER_INT);
556 } else if (b == 0) {
557 // Input buffer ends before we know that the number-value ended
558 _rewind_bytes(ctx, ctx->input_buf_ptr
559 - (ctx->input_buf_value_start-1));
560 return _set_tok(ctx, JSONT_END);
561 }
562 }
563 }
564
565 ctx->error_info = JSONT_ERRINFO_UNEXPECTED;
566 return _set_tok(ctx, JSONT_ERR);
567 }
568 } // while (1)
569}
570