// blob: b318494327b31804ab2b5ae5f462de7232e80485 [file] [log] [blame]
// JSON Tokenizer. Copyright (c) 2012, Rasmus Andersson. All rights reserved.
// Use of this source code is governed by a MIT-style license that can be
// found in the LICENSE file.
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <limits.h>
#include <ctype.h> // isdigit
#include <errno.h>
#include <string.h>
#include <math.h>
#include <assert.h>
// Error info
//
// Default error descriptions. Unless JSONT_ERRINFO_CUSTOM is defined,
// jsont_err_t is a C string and every JSONT_ERRINFO_<NAME> below is a static
// string constant that the tokenizer stores in ctx->error_info when it
// reports JSONT_ERR.
#ifndef JSONT_ERRINFO_CUSTOM
#define jsont_err_t const char*
// DEF_EM(NAME, msg) declares `static jsont_err_t JSONT_ERRINFO_NAME = msg`.
#define DEF_EM(NAME, msg) static jsont_err_t JSONT_ERRINFO_##NAME = msg
DEF_EM(STACK_SIZE, "Stack size limit exceeded");
DEF_EM(UNEXPECTED_OBJECT_END,
"Unexpected end of object while not in an object");
DEF_EM(UNEXPECTED_ARRAY_END, "Unexpected end of array while not in an array");
DEF_EM(UNEXPECTED_COMMA, "Unexpected \",\"");
DEF_EM(UNEXPECTED_COLON, "Unexpected \":\"");
DEF_EM(UNEXPECTED, "Unexpected input");
DEF_EM(UNEXPECTED_UNICODE_SEQ, "Malformed unicode encoded sequence in string");
// DEF_EM is only a local convenience; do not leak it to the rest of the file.
#undef DEF_EM
#endif
// Size of the stack used to track open structures (arrays and objects the
// tokenizer is currently inside of). This value is a balance between the
// memory size of a ctx and how many levels deep the tokenizer can go; opening
// one more structure than this yields JSONT_ERR ("Stack size limit exceeded").
#define _STRUCT_TYPE_STACK_SIZE 512
// Smallest capacity, in bytes, given to the heap buffer that holds decoded
// string values the first time it is allocated (see _value_buf_append).
#define _VALUE_BUF_MIN_SIZE 64
// Lookup table indexed by (byte - '0') for bytes in the range '0'..'f'.
// Each entry is the numeric value of the hexadecimal digit, or 0xff when the
// byte is not a hexadecimal digit.
static const uint8_t kHexValueTable[55] = {
  // '0'..'9'
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
  // ':' ';' '<' '=' '>' '?' '@'
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  // 'A'..'F'
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  // 'G'..'Z' (20 bytes)
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  // '[' '\' ']' '^' '_' '`'
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  // 'a'..'f'
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};
// Token type. Values stored in it are the JSONT_* token constants.
typedef uint8_t jsont_tok_t;
// Tokenizer state. Allocated by jsont_create(), pointed at an input buffer by
// jsont_reset(), advanced by jsont_next() and released by jsont_destroy().
typedef struct jsont_ctx {
// Opaque pointer passed to jsont_create() and returned by jsont_user_data().
void* user_data;
// First byte of the input handed to jsont_reset().
const uint8_t* input_buf;
// Read cursor: the next input byte that will be consumed.
const uint8_t* input_buf_ptr;
// Total number of bytes available at input_buf.
size_t input_len;
// Bounds [start, end) of the current value inside the input buffer.
const uint8_t* input_buf_value_start;
const uint8_t* input_buf_value_end;
// Heap buffer used when a string contains escape sequences and therefore
// cannot be exposed as a plain slice of the input buffer.
struct {
uint8_t* data; // allocated storage, or 0 before the first allocation
size_t size; // capacity of `data` in bytes
size_t length; // number of bytes of `data` currently holding the value
bool inuse; // true when the current value lives in `data`, not in the input
} value_buf;
// Description of the most recent error (set when JSONT_ERR is reported).
jsont_err_t error_info;
// Token most recently produced by the tokenizer.
jsont_tok_t curr_tok;
// Capacity and current depth of st_stack.
size_t st_stack_size;
size_t st_stack_len;
// One entry (JSONT_OBJECT_START or JSONT_ARRAY_START) per structure that is
// currently open, innermost last.
jsont_tok_t st_stack[_STRUCT_TYPE_STACK_SIZE];
} jsont_ctx_t;
#define _JSONT_IN_SOURCE
#include <jsont.h>
// Parses exactly `len` bytes at `bytes` as an unsigned hexadecimal number
// (digits 0-9, A-F, a-f; no prefix).
//
// Returns the parsed value, or ULONG_MAX if any byte is not a hexadecimal
// digit or if the value does not fit in an unsigned long.
unsigned long _hex_str_to_ul(const uint8_t* bytes, size_t len) {
  const unsigned long cutoff = ULONG_MAX / 16;
  const unsigned long cutoff_digit = ULONG_MAX - cutoff * 16;
  unsigned long value = 0;
  for (size_t i = 0; i != len; ++i) {
    const uint8_t b = bytes[i];
    // The table only covers '0'..'f'. Bytes outside that range, and bytes
    // inside it that are not hex digits, both decode to the 0xff marker.
    const uint8_t digit =
        (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : 0xff;
    if (digit > 15) {
      return ULONG_MAX;  // bad digit
    }
    if (value > cutoff || (value == cutoff && digit > cutoff_digit)) {
      return ULONG_MAX;  // overflow
    }
    value = (value * 16) + digit;
  }
  return value;
}
// Allocates a zero-initialized tokenizer context that carries `user_data`.
//
// Returns NULL if memory could not be allocated. The context has no input
// until jsont_reset() is called, and must be released with jsont_destroy().
jsont_ctx_t* jsont_create(void* user_data) {
  jsont_ctx_t* ctx = (jsont_ctx_t*)calloc(1, sizeof(jsont_ctx_t));
  if (ctx == NULL) {
    return NULL;
  }
  ctx->user_data = user_data;
  ctx->st_stack_size = _STRUCT_TYPE_STACK_SIZE;
  return ctx;
}
// Releases a context created by jsont_create(), including its value buffer.
// Passing NULL is a no-op, mirroring free().
void jsont_destroy(jsont_ctx_t* ctx) {
  if (ctx == NULL) {
    return;
  }
  free(ctx->value_buf.data);  // free(NULL) is a no-op
  free(ctx);
}
// Points the tokenizer at a new input of `length` bytes starting at `bytes`
// and clears all parsing state (structure stack, current token, current
// value and error info). The value buffer's allocation is kept for reuse.
void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
  // Input window
  ctx->input_buf = bytes;
  ctx->input_buf_ptr = bytes;
  ctx->input_len = length;

  // Current value
  ctx->input_buf_value_start = 0;
  ctx->input_buf_value_end = 0;
  ctx->value_buf.inuse = false;
  ctx->value_buf.length = 0;

  // Token and structure state
  ctx->curr_tok = JSONT_END;
  ctx->st_stack_len = 0;
  ctx->error_info = 0;
}
// Returns the token most recently produced by the tokenizer.
jsont_tok_t jsont_current(const jsont_ctx_t* ctx) {
  const jsont_tok_t tok = ctx->curr_tok;
  return tok;
}
// Returns the opaque pointer that was passed to jsont_create().
void* jsont_user_data(const jsont_ctx_t* ctx) {
  void* const data = ctx->user_data;
  return data;
}
// Get the current/last byte read. Suitable for debugging JSONT_ERR.
// Returns 0 when no input has been set or no byte has been consumed yet, so
// the byte in front of the input buffer is never read.
uint8_t jsont_current_byte(jsont_ctx_t* ctx) {
  if (ctx->input_buf_ptr == 0 || ctx->input_buf_ptr == ctx->input_buf) {
    return 0;
  }
  return *(ctx->input_buf_ptr - 1);
}
// Returns how far the read cursor is from the start of the input, in bytes.
size_t jsont_current_offset(jsont_ctx_t* ctx) {
  const uint8_t* const base = ctx->input_buf;
  const uint8_t* const cursor = ctx->input_buf_ptr;
  return (size_t)(cursor - base);
}
// Returns the error description stored in the context; it is 0 after
// jsont_reset() until an error has been recorded.
jsont_err_t jsont_error_info(jsont_ctx_t* ctx) {
  jsont_err_t info = ctx->error_info;
  return info;
}
// True when there is no value to read: either no value start has been
// recorded, or the current token lies outside the
// [_JSONT_VALUES_START, _JSONT_VALUES_END] range.
inline static bool _no_value(jsont_ctx_t* ctx) {
  if (ctx->input_buf_value_start == 0) {
    return true;
  }
  const jsont_tok_t tok = ctx->curr_tok;
  return !(tok >= _JSONT_VALUES_START && tok <= _JSONT_VALUES_END);
}
// Number of input bytes that have not been consumed yet.
inline static size_t _input_avail(jsont_ctx_t* ctx) {
  const size_t consumed = (size_t)(ctx->input_buf_ptr - ctx->input_buf);
  return ctx->input_len - consumed;
}
// Consumes and returns the next input byte. When the input is exhausted the
// cursor is left untouched and 0 is returned.
inline static uint8_t _next_byte(jsont_ctx_t* ctx) {
  if (_input_avail(ctx) == 0) {
    return 0;
  }
  const uint8_t b = *ctx->input_buf_ptr;
  ctx->input_buf_ptr += 1;
  return b;
}
// Returns the innermost open structure (the last entry pushed on the
// structure stack), or JSONT_END when no structure is open.
inline static jsont_tok_t _st_stack_top(const jsont_ctx_t* ctx) {
  if (ctx->st_stack_len == 0) {
    return JSONT_END;
  }
  return ctx->st_stack[ctx->st_stack_len - 1];
}
// Exposes the bytes of the current value without copying them.
//
// On success `*bytes` is set to the first byte of the value and the length
// in bytes is returned. The data is not NUL-terminated. When there is no
// current value, 0 is returned and `*bytes` is left untouched.
size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes) {
  if (_no_value(ctx)) {
    return 0;
  }
  if (!ctx->value_buf.inuse) {
    // The value is a plain slice of the input buffer.
    *bytes = ctx->input_buf_value_start;
    return (size_t)(ctx->input_buf_value_end - ctx->input_buf_value_start);
  }
  // The value was decoded into the value buffer.
  *bytes = ctx->value_buf.data;
  return ctx->value_buf.length;
}
// Returns true when the current value consists of exactly the `length` bytes
// found at `bytes`.
//
// The comparison uses the value buffer when it is in use and the slice of
// the input buffer otherwise. Only standard types are used for the length
// comparison, and memcmp() is skipped for empty values so it is never handed
// a null pointer.
bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
  const uint8_t* value;
  size_t value_len;
  if (ctx->value_buf.inuse) {
    value = ctx->value_buf.data;
    value_len = ctx->value_buf.length;
  } else {
    value = ctx->input_buf_value_start;
    value_len =
        (size_t)(ctx->input_buf_value_end - ctx->input_buf_value_start);
  }
  if (value_len != length) {
    return false;
  }
  return length == 0 || memcmp(value, bytes, length) == 0;
}
// Returns a newly allocated, NUL-terminated copy of the current value.
//
// The caller owns the returned string and must free() it. Returns 0 when
// there is no current value or when memory could not be allocated.
char* jsont_strcpy_value(jsont_ctx_t* ctx) {
  if (_no_value(ctx)) {
    return 0;
  }
  const uint8_t* bytes = 0;
  const size_t len = jsont_data_value(ctx, &bytes);
  char* buf = (char*)malloc(len + 1);
  if (buf == NULL) {
    return 0;
  }
  if (len != 0) {
    memcpy(buf, bytes, len);
  }
  buf[len] = 0;
  return buf;
}
// Parses the current value as a base-10 signed 64-bit integer.
//
// An optional leading '-' or '+' is accepted; parsing stops at the first
// byte that is not a decimal digit. Only bytes inside the value are read.
//
// Returns:
//  - the parsed number on success;
//  - INT64_MIN when there is no current value or the value is empty;
//  - INT64_MIN with errno = EINVAL when the value holds no digits;
//  - INT64_MIN or INT64_MAX (by sign) with errno = ERANGE when the number
//    does not fit in an int64_t.
int64_t jsont_int_value(jsont_ctx_t* ctx) {
  if (_no_value(ctx)) {
    return INT64_MIN;
  }
  const uint8_t* bytes = 0;
  const size_t len = jsont_data_value(ctx, &bytes);
  if (len == 0) {
    return INT64_MIN;
  }

  // Optional sign
  size_t pos = 0;
  bool negative = false;
  if (bytes[0] == '-' || bytes[0] == '+') {
    negative = (bytes[0] == '-');
    pos = 1;
    if (pos == len) {
      // A sign with nothing after it
      errno = EINVAL;
      return INT64_MIN;
    }
  }

  // The magnitude is accumulated unsigned. `limit` is the largest magnitude
  // that is representable for the given sign (2^63 when negative).
  const uint64_t base = 10;
  const uint64_t limit =
      negative ? (uint64_t)INT64_MAX + 1u : (uint64_t)INT64_MAX;
  const uint64_t cutoff = limit / base;
  const uint64_t cutlim = limit % base;

  uint64_t acc = 0;
  bool any_digit = false;
  bool overflow = false;
  for (; pos != len; ++pos) {
    const uint8_t b = bytes[pos];
    if (b < '0' || b > '9') {
      break;
    }
    const uint64_t digit = (uint64_t)(b - '0');
    any_digit = true;
    if (overflow || acc > cutoff || (acc == cutoff && digit > cutlim)) {
      // Keep consuming digits, but remember that the result is out of range.
      overflow = true;
    } else {
      acc = acc * base + digit;
    }
  }

  if (overflow) {
    errno = ERANGE;
    return negative ? INT64_MIN : INT64_MAX;
  }
  if (!any_digit) {
    errno = EINVAL;
    return INT64_MIN;
  }
  if (negative) {
    // A magnitude of 2^63 does not fit in int64_t but its negation does.
    return (acc > (uint64_t)INT64_MAX) ? INT64_MIN : -(int64_t)acc;
  }
  return (int64_t)acc;
}
// Quiet NaN used to signal "no number"; falls back to nan("") when the NAN
// macro is not provided by <math.h>.
#ifdef NAN
#define _JSONT_NAN NAN
#else
#define _JSONT_NAN nan("")
#endif
// Parses the current value as a floating point number.
//
// The bytes of the value are not NUL-terminated, so they are first copied
// into a terminated scratch buffer and then converted with strtod(). That
// way the conversion never reads past the end of the value, regardless of
// what follows it in the input.
//
// Returns NaN when there is no current value (errno = EINVAL), when the
// value is empty, or when scratch memory for an unusually long value cannot
// be allocated (errno = ENOMEM).
double jsont_float_value(jsont_ctx_t* ctx) {
  if (_no_value(ctx)) {
    errno = EINVAL;
    return _JSONT_NAN;
  }
  const uint8_t* bytes = 0;
  const size_t len = jsont_data_value(ctx, &bytes);
  if (len == 0) {
    return _JSONT_NAN;
  }

  // Numbers are short; only fall back to the heap for oversized values.
  char stack_text[64];
  char* text = stack_text;
  if (len >= sizeof(stack_text)) {
    text = (char*)malloc(len + 1);
    if (text == NULL) {
      errno = ENOMEM;
      return _JSONT_NAN;
    }
  }
  memcpy(text, bytes, len);
  text[len] = 0;

  const double value = strtod(text, NULL);
  if (text != stack_text) {
    free(text);
  }
  return value;
}
// Records `tok` as the current token and keeps the structure stack in sync:
// object/array starts are pushed, object/array ends pop their matching start.
//
// Returns `tok`, or JSONT_ERR (which then also becomes the current token,
// with error_info set) when the stack is full or when an end token does not
// match the innermost open structure.
inline static jsont_tok_t _set_tok(jsont_ctx_t* ctx, jsont_tok_t tok) {
  ctx->curr_tok = tok;
  if (tok == JSONT_END) {
    return tok;
  }

  // Structure openers: push onto the stack.
  if (tok == JSONT_OBJECT_START || tok == JSONT_ARRAY_START) {
    if (ctx->st_stack_len == ctx->st_stack_size) {
      // TODO: Grow st_stack
      ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
      ctx->curr_tok = JSONT_ERR;
      return JSONT_ERR;
    }
    ctx->st_stack[ctx->st_stack_len] = tok;
    ctx->st_stack_len += 1;
    return tok;
  }

  // Structure closers: must match the innermost opener, then pop it.
  if (tok == JSONT_OBJECT_END) {
    if (_st_stack_top(ctx) != JSONT_OBJECT_START) {
      ctx->error_info = JSONT_ERRINFO_UNEXPECTED_OBJECT_END;
      ctx->curr_tok = JSONT_ERR;
      return JSONT_ERR;
    }
    ctx->st_stack_len -= 1;
  } else if (tok == JSONT_ARRAY_END) {
    if (_st_stack_top(ctx) != JSONT_ARRAY_START) {
      ctx->error_info = JSONT_ERRINFO_UNEXPECTED_ARRAY_END;
      ctx->curr_tok = JSONT_ERR;
      return JSONT_ERR;
    }
    ctx->st_stack_len -= 1;
  }
  return tok;
}
// Moves the read cursor back by a single byte.
inline static void _rewind_one_byte(jsont_ctx_t* ctx) {
  ctx->input_buf_ptr -= 1;
}
// Moves the read cursor back by `n` bytes.
inline static void _rewind_bytes(jsont_ctx_t* ctx, size_t n) {
  ctx->input_buf_ptr = ctx->input_buf_ptr - n;
}
// Moves the read cursor forward by `n` bytes without inspecting them.
inline static void _skip_bytes(jsont_ctx_t* ctx, size_t n) {
  ctx->input_buf_ptr = ctx->input_buf_ptr + n;
}
// Finishes reading an atom whose first byte has already been consumed.
// `slacklen` is the number of bytes that remain (e.g. "ull" after "n" or
// "alse" after "f"); they are skipped without being compared.
//
// If fewer than `slacklen` bytes are available, the first byte is given back
// and JSONT_END is reported so the caller can wait for a buffer fill.
inline static uint8_t _read_atom(jsont_ctx_t* ctx, size_t slacklen,
                                 jsont_tok_t tok) {
  if (_input_avail(ctx) >= slacklen) {
    _skip_bytes(ctx, slacklen);
    return _set_tok(ctx, tok);
  }
  // rewind and wait for buffer fill
  _rewind_one_byte(ctx);
  return _set_tok(ctx, JSONT_END);
}
// True when the next string is a field name: the current token is an object
// start, or it is a comma while the innermost open structure is an object.
inline static bool _expects_field_name(jsont_ctx_t* ctx) {
  if (ctx->curr_tok == JSONT_OBJECT_START) {
    return true;
  }
  return ctx->curr_tok == _JSONT_COMMA
      && _st_stack_top(ctx) == JSONT_OBJECT_START;
}
// Appends `len` bytes from `data` to the context's value buffer and marks
// the buffer as holding the current value.
//
// The buffer is allocated on first use (at least _VALUE_BUF_MIN_SIZE bytes)
// and grown when the appended data does not fit. This function has no way to
// report failure, so running out of memory aborts the process; that makes
// the behavior the same whether or not assertions are compiled in.
static void _value_buf_append(jsont_ctx_t* ctx, const uint8_t* data, size_t len) {
  if (ctx->value_buf.size == 0) {
    // First use: allocate with some headroom.
    assert(ctx->value_buf.data == 0);
    size_t initial_size = len * 2;
    if (initial_size < _VALUE_BUF_MIN_SIZE) {
      initial_size = _VALUE_BUF_MIN_SIZE;
    }
    uint8_t* fresh = (uint8_t*)malloc(initial_size);
    if (fresh == NULL) {
      abort();
    }
    ctx->value_buf.data = fresh;
    ctx->value_buf.size = initial_size;
    ctx->value_buf.length = 0;
  } else if (ctx->value_buf.length + len > ctx->value_buf.size) {
    // Grow. The old pointer is only replaced once realloc has succeeded.
    const size_t new_size = ctx->value_buf.size + (len * 2);
    uint8_t* grown = (uint8_t*)realloc(ctx->value_buf.data, new_size);
    if (grown == NULL) {
      abort();
    }
    ctx->value_buf.data = grown;
    ctx->value_buf.size = new_size;
  }

  if (len != 0) {
    memcpy(ctx->value_buf.data + ctx->value_buf.length, data, len);
  }
  ctx->value_buf.length += len;
  ctx->value_buf.inuse = true;
}
// Reads the next token from the input and makes it the current token.
//
// Returns one of the JSONT_* tokens:
//  - structure tokens for '{' '}' '[' ']';
//  - JSONT_NULL / JSONT_TRUE / JSONT_FALSE for atoms starting with n / t / f;
//  - JSONT_FIELD_NAME or JSONT_STRING for strings. Strings without escape
//    sequences are exposed as a slice of the input; strings with escapes are
//    decoded into the value buffer;
//  - JSONT_NUMBER_INT or JSONT_NUMBER_FLOAT for numbers (float if the number
//    contains a '.');
//  - JSONT_END when the input is exhausted. If the input ends in the middle
//    of an atom, string or number, the cursor is moved back to the start of
//    that token so it can be read again once more input is available;
//  - JSONT_ERR on malformed input, with a description in error_info.
//
// Commas, colons and whitespace do not produce tokens of their own.
jsont_tok_t jsont_next(jsont_ctx_t* ctx) {
  //
  // { } [ ] n t f "
  //         | | | |
  //         | | | +- /[^"]*/ "
  //         | | +- a l s e
  //         | +- r u e
  //         +- u l l
  //
  while (1) {
    uint8_t b = _next_byte(ctx);
    switch (b) {
      case '{': return _set_tok(ctx, JSONT_OBJECT_START);
      case '}': return _set_tok(ctx, JSONT_OBJECT_END);
      case '[': return _set_tok(ctx, JSONT_ARRAY_START);
      case ']': return _set_tok(ctx, JSONT_ARRAY_END);
      case 'n': return _read_atom(ctx, 3, JSONT_NULL);
      case 't': return _read_atom(ctx, 3, JSONT_TRUE);
      case 'f': return _read_atom(ctx, 4, JSONT_FALSE);

      case '"': {
        ctx->input_buf_value_start = ctx->input_buf_ptr;
        ctx->value_buf.inuse = false;
        ctx->value_buf.length = 0;
        // True while the byte being read directly follows an escape prefix
        // ('\'). It is cleared as soon as the escaped byte has been handled,
        // so an escaped backslash does not escape the byte that follows it.
        bool escaped = false;

        while (1) {
          b = _next_byte(ctx);

          if (escaped) {
            escaped = false;
            assert(ctx->value_buf.inuse == true); // should be buffering
            // JSON specifies a few "magic" characters that have a different
            // meaning than their value:
            switch (b) {
              case 'b':
                _value_buf_append(ctx, (const uint8_t*)"\b", 1);
                break;
              case 'f':
                _value_buf_append(ctx, (const uint8_t*)"\f", 1);
                break;
              case 'n':
                _value_buf_append(ctx, (const uint8_t*)"\n", 1);
                break;
              case 'r':
                _value_buf_append(ctx, (const uint8_t*)"\r", 1);
                break;
              case 't':
                _value_buf_append(ctx, (const uint8_t*)"\t", 1);
                break;
              case 'u': {
                // 4 hex digits should follow
                if (_input_avail(ctx) < 4) {
                  // Not enough input yet: go back to the opening quote.
                  _rewind_bytes(ctx,
                    ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
                  return _set_tok(ctx, JSONT_END);
                }
                unsigned long utf16cp = _hex_str_to_ul(ctx->input_buf_ptr, 4);
                ctx->input_buf_ptr += 4;
                if (utf16cp == ULONG_MAX) {
                  ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
                  return _set_tok(ctx, JSONT_ERR);
                }
                uint32_t cp = (uint16_t)(0xffff & utf16cp);
                // Is lead surrogate?
                if (cp >= 0xd800u && cp <= 0xdbffu) {
                  // TODO: Implement pairs by reading another "\uHHHH"
                  ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
                  return _set_tok(ctx, JSONT_ERR);
                }
                // Append UTF-8 byte(s) representing the Unicode codepoint `cp`
                if (cp < 0x80) {
                  uint8_t cp8 = ((uint8_t)cp);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                } else if (cp < 0x800) {
                  uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                  cp8 = (uint8_t)((cp & 0x3f) | 0x80);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                } else {
                  uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                  cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                  cp8 = (uint8_t)((cp & 0x3f) | 0x80);
                  _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
                }
                break;
              }
              default: {
                // Any other escaped byte (including '\', '"' and '/') stands
                // for itself.
                _value_buf_append(ctx, &b, 1);
                break;
              }
            } // switch

          } else if (b == '\\') {
            // Okay, this is an escape prefix. Move to buffering value.
            if (!ctx->value_buf.inuse) {
              _value_buf_append(ctx,
                ctx->input_buf_value_start,
                // any data before the "\":
                (size_t)(ctx->input_buf_ptr-1 - ctx->input_buf_value_start) );
            }
            escaped = true;

          } else if (b == '"') {
            // Well, this marks the end of a string
            ctx->input_buf_value_end = ctx->input_buf_ptr-1;
            return _set_tok(ctx, _expects_field_name(ctx)
              ? JSONT_FIELD_NAME : JSONT_STRING);

          } else if (b == 0) {
            // Input buffer ends in the middle of a string: go back to the
            // opening quote.
            _rewind_bytes(ctx,
              ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
            return _set_tok(ctx, JSONT_END);

          } else if (ctx->value_buf.inuse) {
            // Plain byte while buffering
            _value_buf_append(ctx, &b, 1);
          }
        }
      }

      case ',':
        if (   ctx->curr_tok == JSONT_OBJECT_START
            || ctx->curr_tok == JSONT_ARRAY_START
            || ctx->curr_tok == JSONT_END
            || ctx->curr_tok == JSONT_ERR) {
          if (ctx->curr_tok != JSONT_ERR)
            ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COMMA;
          return _set_tok(ctx, JSONT_ERR);
        }
        _set_tok(ctx, _JSONT_COMMA);
        // read next by simply letting the outer "while" do its thing
        break;

      case ':':
        if (ctx->curr_tok != JSONT_FIELD_NAME) {
          ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COLON;
          return _set_tok(ctx, JSONT_ERR);
        }
        // let the outer "while" do its thing
        break;

      case ' ': case '\r': case '\n': case '\t':
        // ignore whitespace and let the outer "while" do its thing
        break;

      case 0:
        return _set_tok(ctx, JSONT_END);

      default:
        if (isdigit((int)b) || b == '+' || b == '-') {
          // We are reading a number
          ctx->input_buf_value_start = ctx->input_buf_ptr-1;
          // Numbers are always read straight from the input buffer. Release
          // the value buffer so that a decoded string left over from an
          // earlier token is not reported as this number's value.
          ctx->value_buf.inuse = false;
          ctx->value_buf.length = 0;
          bool is_float = false;

          while (1) {
            if (_input_avail(ctx) == 0) {
              // Input buffer ends before we know that the number-value ended.
              // Go back to the first byte of the number and wait for more
              // input.
              ctx->input_buf_ptr = ctx->input_buf_value_start;
              return _set_tok(ctx, JSONT_END);
            }
            b = _next_byte(ctx);
            if (b == '.') {
              is_float = true;
            } else if (!isdigit((int)b)) {
              // First byte after the number: give it back and finish.
              _rewind_one_byte(ctx);
              ctx->input_buf_value_end = ctx->input_buf_ptr;
              return _set_tok(ctx, is_float ? JSONT_NUMBER_FLOAT
                                            : JSONT_NUMBER_INT);
            }
          }
        }

        ctx->error_info = JSONT_ERRINFO_UNEXPECTED;
        return _set_tok(ctx, JSONT_ERR);
    }
  } // while (1)
}