Squashed 'third_party/jsont/' content from commit 1536152d7
Change-Id: I51a80190772b74ca0d45fd3fadc130e872b57cc0
git-subtree-dir: third_party/jsont
git-subtree-split: 1536152d7c1926448d42e4a691acd9a15940b20c
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ecee4c9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+example1
+example2
+*.d
+.objs
+x*
+test/build
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..03c7813
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2012 Rasmus Andersson <http://rsms.me/>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..848cdc2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,57 @@
+c_sources := jsont.c
+
+all: example1 example2 test
+
+object_dir = .objs
+objects = $(patsubst %,$(object_dir)/%,${c_sources:.c=.o})
+object_dirs = $(sort $(foreach fn,$(objects),$(dir $(fn))))
+-include ${objects:.o=.d}
+
+test_dir = test
+test_sources := $(wildcard test/test*.c)
+test_object_dir = $(test_dir)/.objs
+test_build_dir = $(test_dir)/build
+test_objects = $(patsubst test/%,$(test_object_dir)/%,${test_sources:.c=.o})
+test_programs = $(patsubst test/%.c,$(test_build_dir)/%,$(test_sources))
+test_object_dirs = $(sort $(foreach fn,$(test_objects),$(dir $(fn))))
+
+CC = clang
+LD = clang
+
+CFLAGS += -Wall -g -MMD -std=c99 -I.
+TEST_CFLAGS := $(CFLAGS) -O0
+#LDFLAGS +=
+ifneq ($(DEBUG),)
+ CFLAGS += -O0 -DDEBUG=1
+else
+ CFLAGS += -O3 -DNDEBUG
+endif
+
+clean:
+ rm -f jsont example1 example2
+ rm -rf $(object_dir)
+ rm -rf $(test_object_dir)
+ rm -rf $(test_build_dir)
+
+example1: $(objects) $(object_dir)/example1.o
+ $(LD) $(LDFLAGS) -o $@ $^
+
+example2: $(objects) $(object_dir)/example2.o
+ $(LD) $(LDFLAGS) -o $@ $^
+
+test: $(objects) $(test_programs)
+ $(test_programs)
+
+$(test_build_dir)/%: $(objects) $(test_object_dir)/%.o
+ @mkdir -p `dirname $@`
+ $(LD) $(LDFLAGS) -o $@ $^
+
+$(test_object_dir)/%.o: $(test_dir)/%.c
+ @mkdir -p `dirname $@`
+ $(CC) $(TEST_CFLAGS) -c -o $@ $<
+
+$(object_dir)/%.o: %.c
+ @mkdir -p `dirname $@`
+ $(CC) $(CFLAGS) -c -o $@ $<
+
+.PHONY: clean all test
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..34ec56e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,209 @@
+# JSON Tokenizer (jsont)
+
+A minimal and portable JSON tokenizer written in standard C and C++ (two separate versions). Performs validating and highly efficient parsing suitable for reading JSON directly into custom data structures. There are no code dependencies — simply include `jsont.{h,hh,c,cc}` in your project.
+
+Build and run unit tests:
+
+ make
+
+## Synopsis
+
+C API:
+
+```c
+jsont_ctx_t* S = jsont_create(0);
+jsont_reset(S, uint8_t* inbuf, size_t inbuf_len);
+tok = jsont_next(S)
+// branch on `tok` ...
+V = jsont_*_value(S[, ...]);
+jsont_destroy(S);
+```
+
+New C++ API:
+
+```cc
+jsont::Tokenizer S(const char* inbuf, size_t length);
+jsont::Token token;
+while ((token = S.next())) {
+ if (token == jsont::Float) {
+ printf("%g\n", S.floatValue());
+ } ... else if (token == jsont::Error) {
+ // handle error
+ break;
+ }
+}
+```
+
+```cc
+jsont::Builder json;
+json.startObject()
+ .fieldName("foo").value(123.45)
+ .fieldName("bar").startArray()
+ .value(678)
+ .value("nine \"ten\"")
+ .endArray()
+ .endObject();
+std::cout << json.toString() << std::endl;
+// {"foo":123.45,"bar":[678,"nine \"ten\""]}
+```
+
+# API overview
+
+See `jsont.h` and `jsont.hh` for a complete overview of the API, including more detailed documentation. Here's an overview:
+
+## C++ API `namespace jsont`
+
+- `Builder build()` — convenience builder factory
+
+### class Tokenizer
+
+Reads a sequence of bytes and produces tokens and values while doing so.
+
+- `Tokenizer(const char* bytes, size_t length, TextEncoding encoding)` — initialize a new Tokenizer to read `bytes` of `length` in `encoding`
+- `void reset(const char* bytes, size_t length, TextEncoding encoding)` — Reset the tokenizer, making it possible to reuse this parser so to avoid unnecessary memory allocation and deallocation.
+
+#### Reading tokens
+
+- `const Token& next() throw(Error)` — Read next token, possibly throwing an `Error`
+- `const Token& current() const` — Access current token
+
+#### Reading values
+
+- `bool hasValue() const` — True if the current token has a value
+- `size_t dataValue(const char const** bytes)` — Returns a slice of the input which represents the current value, or nothing (returns 0) if the current token has no value (e.g. start of an object).
+- `std::string stringValue() const` — Returns a *copy* of the current string value.
+- `double floatValue() const` — Returns the current value as a double-precision floating-point number.
+- `int64_t intValue() const` — Returns the current value as a signed 64-bit integer.
+
+#### Handling errors
+
+- `ErrorCode error() const` — Returns the error code of the last error
+- `const char* errorMessage() const` — Returns a human-readable message for the last error. Never returns NULL.
+
+#### Accessing the underlying input buffer
+
+- `const char* inputBytes() const` — A pointer to the input data as passed to `reset` or the constructor.
+- `size_t inputSize() const` — Total number of input bytes
+- `size_t inputOffset() const` — The byte offset into input where the tokenizer is currently at. In the event of an error, this will point to the source of the error.
+
+### enum Token
+
+- `End` — Input ended
+- `ObjectStart` — {
+- `ObjectEnd` — }
+- `ArrayStart` — [
+- `ArrayEnd` — ]
+- `True` — true
+- `False` — false
+- `Null` — null
+- `Integer` — number value without a fraction part (access as int64 through `Tokenizer::intValue()`)
+- `Float` — number value with a fraction part (access as double through `Tokenizer::floatValue()`)
+- `String` — string value (access value through `Tokenizer::stringValue()` et al)
+- `FieldName` — field name (access value through `Tokenizer::stringValue()` et al)
+- `Error` — an error occurred (access error code through `Tokenizer::error()` et al)
+
+### enum TextEncoding
+
+- `UTF8TextEncoding` — Unicode UTF-8 text encoding
+
+### enum Tokenizer::ErrorCode
+
+- `UnspecifiedError` — Unspecified error
+- `UnexpectedComma` — Unexpected comma
+- `UnexpectedTrailingComma` — Unexpected trailing comma
+- `InvalidByte` — Invalid input byte
+- `PrematureEndOfInput` — Premature end of input
+- `MalformedUnicodeEscapeSequence` — Malformed Unicode escape sequence
+- `MalformedNumberLiteral` — Malformed number literal
+- `UnterminatedString` — Unterminated string
+- `SyntaxError` — Illegal JSON (syntax error)
+
+### class Builder
+
+Aids in building JSON, providing a final sequential byte buffer.
+
+- `Builder()` — initialize a new builder with an empty backing buffer
+- `Builder& startObject()` — Start an object (appends a `'{'` character to the backing buffer)
+- `Builder& endObject()` — End an object (a `'}'` character)
+- `Builder& startArray()` — Start an array (`'['`)
+- `Builder& endArray()` — End an array (`']'`)
+- `const void reset()` — Reset the builder to its neutral state. Note that the backing buffer is reused in this case.
+
+#### Building
+
+- `Builder& fieldName(const char* v, size_t length, TextEncoding encoding=UTF8TextEncoding)` — Adds a field name by copying `length` bytes from `v`.
+- `Builder& fieldName(const std::string& name, TextEncoding encoding=UTF8TextEncoding)` — Adds a field name by copying `name`.
+- `Builder& value(const char* v, size_t length, TextEncoding encoding=UTF8TextEncoding)` — Adds a string value by copying `length` bytes from `v` which content is encoded according to `encoding`.
+- `Builder& value(const char* v)` — Adds a string value by copying `strlen(v)` bytes from c-string `v`. Uses the default encoding of `value(const char*,size_t,TextEncoding)`.
+- `Builder& value(const std::string& v)` — Adds a string value by copying `v`. Uses the default encoding of `value(const char*,size_t,TextEncoding)`.
+- `Builder& value(double v)` — Adds a possibly fractional number
+- `Builder& value(int64_t v)`, `void value(int v)`, `void value(unsigned int v)`, `void value(long v)` — Adds an integer number
+- `Builder& value(bool v)` — Adds the "true" or "false" atom, depending on `v`
+- `Builder& nullValue()` — Adds the "null" atom
+
+#### Managing the result
+
+- `size_t size() const` — Number of readable bytes at the pointer returned by `bytes()`
+- `const char* bytes() const` — Pointer to the backing buffer, holding the resulting JSON.
+- `std::string toString() const` — Return a `std::string` object holding a copy of the backing buffer, representing the JSON.
+- `const char* seizeBytes(size_t& size_out)` — "Steal" the backing buffer. After this call, the caller is responsible for calling `free()` on the returned pointer. Returns NULL on failure. Sets the value of `size_out` to the number of readable bytes at the returned pointer. The builder will be reset and ready to use (which will act on a new backing buffer).
+
+----
+
+## C API
+
+### Types
+
+- `jsont_ctx_t` — A tokenizer context ("instance" in OOP lingo.)
+- `jsont_tok_t` — A token type (see "Token types".)
+- `jsont_err_t` — A user-configurable error type, which defaults to `const char*`.
+
+### Managing a tokenizer context
+
+- `jsont_ctx_t* jsont_create(void* user_data)` — Create a new JSON tokenizer context.
+- `void jsont_destroy(jsont_ctx_t* ctx)` — Destroy a JSON tokenizer context.
+- `void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length)` — Reset the tokenizer to parse the data pointed to by `bytes`.
+
+### Dealing with tokens
+
+- `jsont_tok_t jsont_next(jsont_ctx_t* ctx)` — Read and return the next token.
+- `jsont_tok_t jsont_current(const jsont_ctx_t* ctx)` — Returns the current token (last token read by `jsont_next`).
+
+### Accessing and comparing values
+
+- `int64_t jsont_int_value(jsont_ctx_t* ctx)` — Returns the current integer value.
+- `double jsont_float_value(jsont_ctx_t* ctx)` — Returns the current floating-point number value.
+- `size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes)` — Returns a slice of the input which represents the current value.
+- `char* jsont_strcpy_value(jsont_ctx_t* ctx)` — Retrieve a newly allocated c-string.
+- `bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length)` — Returns true if the current data value is equal to `bytes` of `length`
+- `bool jsont_str_equals(jsont_ctx_t* ctx, const char* str)` — Returns true if the current data value is equal to c string `str`.
+
+Note that the data is not parsed until you call one of these functions. This means that if you know that a value transferred as a string will fit in a 64-bit signed integer, it's completely valid to call `jsont_int_value` to parse the string as an integer.
+
+### Miscellaneous
+
+- `uint8_t jsont_current_byte(jsont_ctx_t* ctx)` — Get the last byte read.
+- `size_t jsont_current_offset(jsont_ctx_t* ctx)` — Get the current offset of the last byte read.
+- `jsont_err_t jsont_error_info(jsont_ctx_t* ctx)` — Get information on the last error.
+- `void* jsont_user_data(const jsont_ctx_t* ctx)` — Returns the value passed to `jsont_create`
+
+### Token types
+
+- `JSONT_END` — Input ended.
+- `JSONT_ERR` — Error. Retrieve details through `jsont_error_info`
+- `JSONT_OBJECT_START` — {
+- `JSONT_OBJECT_END` — }
+- `JSONT_ARRAY_START` — [
+- `JSONT_ARRAY_END` — ]
+- `JSONT_TRUE` — true
+- `JSONT_FALSE` — false
+- `JSONT_NULL` — null
+- `JSONT_NUMBER_INT` — number value without a fraction part (access through `jsont_int_value` or `jsont_float_value`)
+- `JSONT_NUMBER_FLOAT` — number value with a fraction part (access through `jsont_float_value`)
+- `JSONT_STRING` — string value (access through `jsont_data_value` or `jsont_strcpy_value`)
+- `JSONT_FIELD_NAME` — field name (access through `jsont_data_value` or `jsont_strcpy_value`)
+
+## Further reading
+
+- See `example*.c` for working sample programs.
+- See `LICENSE` for the MIT-style license under which this project is licensed.
diff --git a/example1.c b/example1.c
new file mode 100644
index 0000000..c36559c
--- /dev/null
+++ b/example1.c
@@ -0,0 +1,76 @@
+//
+// This is a simple example of running the tokenizer, outputting information
+// to stdout about what tokens we get and their values.
+//
+#include <jsont.h>
+#include <stdio.h>
+#include <string.h>
+
+static const char* _tok_name(jsont_tok_t tok);
+
+int main(int argc, const char** argv) {
+  // Create a new reusable tokenizer
+  jsont_ctx_t* S = jsont_create(0);
+
+  // Sample input
+  const char* inbuf = "{\"Ape\":123,\"Bro\":[400192,\"51\",true, false, null,"
+                      " -67,\r\n\t 6.123]}";
+
+  // Reset the parser with a pointer to our sample input
+  jsont_reset(S, (const uint8_t*)inbuf, strlen(inbuf));
+
+  // Read each token
+  jsont_tok_t tok;
+  printf("Token | Value\n"
+         "-------------|----------------------------------------\n");
+  while ( (tok = jsont_next(S)) != JSONT_END && tok != JSONT_ERR) {
+    printf("%-12s |", _tok_name(tok));
+
+    // If the token has a value, also print its value
+    if (tok == JSONT_STRING || tok == JSONT_FIELD_NAME) {
+      const uint8_t* bytes = 0;
+      size_t len = jsont_data_value(S, &bytes);
+      if (len != 0)
+        printf(" '%.*s'", (int)len, (const char*)bytes);
+    } else if (tok == JSONT_NUMBER_INT) {
+      printf(" %lld", (long long)jsont_int_value(S));  // int64_t may be `long`
+    } else if (tok == JSONT_NUMBER_FLOAT) {
+      printf(" %f", jsont_float_value(S));
+    }
+
+    printf("\n");
+  }
+
+  // If we got an error, print some useful information and exit with 1
+  int status = 0;
+  if (tok == JSONT_ERR) {
+    fprintf(stderr, "Error: %s ('%c' at offset %lu)\n",
+            jsont_error_info(S),
+            (char)jsont_current_byte(S),
+            (unsigned long)jsont_current_offset(S));
+    status = 1;
+  }
+  // Destroy our reusable tokenizer (on both paths) and exit
+  jsont_destroy(S);
+  return status;
+}
+
+// Maps a token to a static, printable name; unrecognized tokens yield "?".
+static const char* _tok_name(jsont_tok_t tok) {
+ switch (tok) {
+ case JSONT_END: return "END";
+ case JSONT_ERR: return "ERR";
+ case JSONT_OBJECT_START: return "OBJECT_START";
+ case JSONT_OBJECT_END: return "OBJECT_END";
+ case JSONT_ARRAY_START: return "ARRAY_START";
+ case JSONT_ARRAY_END: return "ARRAY_END";
+ case JSONT_TRUE: return "TRUE";
+ case JSONT_FALSE: return "FALSE";
+ case JSONT_NULL: return "NULL";
+ case JSONT_NUMBER_INT: return "NUMBER_INT";
+ case JSONT_NUMBER_FLOAT: return "NUMBER_FLOAT";
+ case JSONT_STRING: return "STRING";
+ case JSONT_FIELD_NAME: return "FIELD_NAME";
+ default: return "?";
+ }
+}
diff --git a/example2.c b/example2.c
new file mode 100644
index 0000000..5077f85
--- /dev/null
+++ b/example2.c
@@ -0,0 +1,183 @@
+//
+// This is an example of parsing and building strict documents into C structs.
+//
+// The general approach is that each object type has a struct type and a
+// builder function. The struct type has members which represents its
+// properties. The builder function is more interesting: It takes a tokenizer
+// state and a struct instance. The builder function then reads each field
+// name from the tokenizer and calls other builder functions (this is how this
+// parser does flow control), and eventually stores the values into the struct
+// instance.
+#include <jsont.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+// A simple array type
+typedef struct my_array {
+ size_t size; // allocated capacity of `items` (set by MY_ARRAY_ALLOC / MY_ARRAY_RESIZE)
+ size_t count; // number of entries in use (advanced by MY_ARRAY_APPEND)
+ void** items; // heap array of untyped element pointers
+} my_array_t;
+
+// Represents a user object
+typedef struct my_user {
+ const char* id; // heap copy made by jsont_strcpy_value() in my_user_build()
+ const char* name; // heap copy made by jsont_strcpy_value() in my_user_build()
+} my_user_t;
+
+// Represents a response from our imaginary service
+typedef struct my_response {
+ int64_t timestamp; // value of the "timestamp" field
+ const char* viewer_id; // heap copy of the "viewer_id" field
+ my_array_t users; // my_user_t* entries built from the "users" array
+} my_response_t;
+
+// A helper macro for allocating a new struct instance
+#define MY_NEW(T) (T*)malloc(sizeof(T))
+
+// Some helper macros for dealing with growing arrays
+#define MY_ARRAY_ALLOC(A, _size) do {\
+  (A).items = (void*)malloc(sizeof(void*)*(_size)); /* arg parenthesized */ \
+  (A).count = 0; \
+  (A).size = (_size); \
+  } while(0)
+#define MY_ARRAY_RESIZE(A, _size) do {\
+  (A).items = (void*)realloc((A).items, sizeof(void*)*(_size)); /* arg parenthesized */ \
+  (A).size = (_size); \
+  } while(0)
+#define MY_ARRAY_APPEND(A, item) (A).items[(A).count++] = (void*)(item)
+#define MY_NEXT_EXPECT(S, TOKTYPE) do { \
+ if ((tok = jsont_next(S)) != TOKTYPE) { \
+ printf("Error: Builder expected token " #TOKTYPE " (%d)\n", __LINE__); \
+ return false; \
+ }} while (0)
+
+// Builder function for user objects
+bool my_user_build(jsont_ctx_t* S, my_user_t* obj) {
+  jsont_tok_t tok = jsont_current(S);
+  if (tok != JSONT_OBJECT_START) return false;
+
+  // for each field
+  while ((tok = jsont_next(S)) == JSONT_FIELD_NAME) {
+    // Whole-name comparison: a bare memcmp would also accept prefixes ("i").
+    if (jsont_str_equals(S, "id")) {
+      MY_NEXT_EXPECT(S, JSONT_STRING);
+      obj->id = jsont_strcpy_value(S);
+
+    } else if (jsont_str_equals(S, "name")) {
+      MY_NEXT_EXPECT(S, JSONT_STRING);
+      obj->name = jsont_strcpy_value(S);
+
+    } else {
+      const uint8_t* fieldname = 0;
+      size_t len = jsont_data_value(S, &fieldname);
+      printf("%s: Unexpected field: \"%.*s\"\n", __FUNCTION__,
+             (int)len, (const char*)fieldname);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Builder function for response objects
+bool my_response_build(jsont_ctx_t* S, my_response_t* obj) {
+  jsont_tok_t tok = jsont_current(S);
+  if (tok != JSONT_OBJECT_START) return false;
+
+  // for each field (whole-name comparison; memcmp alone would accept prefixes)
+  while ((tok = jsont_next(S)) == JSONT_FIELD_NAME) {
+    if (jsont_str_equals(S, "timestamp")) {
+      MY_NEXT_EXPECT(S, JSONT_NUMBER_INT);
+      obj->timestamp = jsont_int_value(S);
+
+    } else if (jsont_str_equals(S, "viewer_id")) {
+      MY_NEXT_EXPECT(S, JSONT_STRING);
+      obj->viewer_id = jsont_strcpy_value(S);
+
+    } else if (jsont_str_equals(S, "users")) {
+      MY_NEXT_EXPECT(S, JSONT_ARRAY_START);
+      MY_ARRAY_ALLOC(obj->users, 10);
+      // for each user object
+      while ((tok = jsont_next(S)) == JSONT_OBJECT_START) {
+        if (obj->users.count == obj->users.size)
+          MY_ARRAY_RESIZE(obj->users, obj->users.size * 2);
+        my_user_t* user = MY_NEW(my_user_t);
+        if (!my_user_build(S, user)) {
+          free(user);  // not yet stored in the array, so nobody else frees it
+          return false;
+        }
+        MY_ARRAY_APPEND(obj->users, user);
+      }
+    } else {
+      const uint8_t* fieldname = 0;
+      size_t len = jsont_data_value(S, &fieldname);
+      printf("%s: Unexpected field: \"%.*s\"\n", __FUNCTION__,
+             (int)len, (const char*)fieldname);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Our simple response parser entry point. Returns NULL on error.
+my_response_t* my_parse_response(jsont_ctx_t* S) {
+  if (jsont_next(S) != JSONT_OBJECT_START) {
+    printf("Expected JSON input to start with an object.\n");
+    return 0;
+  }
+  // calloc zero-fills, so fields absent from the input read as 0/NULL
+  my_response_t* rsp = (my_response_t*)calloc(1, sizeof(my_response_t));
+  if (rsp == 0 || !my_response_build(S, rsp)) {
+    free(rsp);  // free(NULL) is a no-op, so this covers the OOM case too
+    return 0; }
+  return rsp;
+}
+
+int main(int argc, const char** argv) {
+ // Create a new reusable tokenizer
+ jsont_ctx_t* S = jsont_create(0);
+
+ // Sample "response" data
+ const char* inbuf = "{"
+ "\"viewer_id\": \"abc123\","
+ "\"timestamp\": 1234567890,"
+ "\"users\":["
+ "{\"name\": \"John Smith\", \"id\": \"12c39a\"},\n"
+ "{\"name\": \"John Doe\", \"id\": \"01dk2\"},\n"
+ "{\"name\": \"Kate Smith\", \"id\": \"apru1\"},\n"
+ "{\"name\": \"Rebecca Doe\",\"id\": \"aRm26\"}\n"
+ "]"
+ "}";
+
+ // Parse the sample "response" data
+ jsont_reset(S, (const uint8_t*)inbuf, strlen(inbuf));
+ my_response_t* rsp = my_parse_response(S); // NULL when parsing/building failed
+
+ // Epic success?
+ if (rsp) {
+ printf("Built response structure.\n");
+ printf("rsp->users.items[2]->name => \"%s\"\n",
+ ((my_user_t*)rsp->users.items[2])->name ); // the sample input has four users, so index 2 exists
+
+ } else {
+ printf("Failed to build response structure.\n");
+ if (jsont_error_info(S) != 0) { // only set when the tokenizer itself reported an error
+ fprintf(stderr, "Error: %s ('%c' at offset %lu)\n",
+ jsont_error_info(S),
+ (char)jsont_current_byte(S),
+ (unsigned long)jsont_current_offset(S));
+ }
+ // Exit with error. Note: In a real application, you should call
+ // `jsont_destroy` on the reusable tokenizer when done with it. Here we
+ // just exit the program.
+ return 1;
+ }
+
+ // Destroy our reusable tokenizer and exit
+ jsont_destroy(S);
+ return 0;
+}
diff --git a/jsont.c b/jsont.c
new file mode 100644
index 0000000..5863c7a
--- /dev/null
+++ b/jsont.c
@@ -0,0 +1,569 @@
+// JSON Tokenizer. Copyright (c) 2012, Rasmus Andersson. All rights reserved.
+// Use of this source code is governed by a MIT-style license that can be
+// found in the LICENSE file.
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <ctype.h> // isdigit
+#include <errno.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+// Error info
+#ifndef JSONT_ERRINFO_CUSTOM
+#define jsont_err_t const char*
+#define DEF_EM(NAME, msg) static jsont_err_t JSONT_ERRINFO_##NAME = msg
+DEF_EM(STACK_SIZE, "Stack size limit exceeded");
+DEF_EM(UNEXPECTED_OBJECT_END,
+ "Unexpected end of object while not in an object");
+DEF_EM(UNEXPECTED_ARRAY_END, "Unexpected end of array while not in an array");
+DEF_EM(UNEXPECTED_COMMA, "Unexpected \",\"");
+DEF_EM(UNEXPECTED_COLON, "Unexpected \":\"");
+DEF_EM(UNEXPECTED, "Unexpected input");
+DEF_EM(UNEXPECTED_UNICODE_SEQ, "Malformed unicode encoded sequence in string");
+#undef DEF_EM
+#endif
+
+// Size of stack used for structures (in/out array and objects). This value
+// is a balance between memory size of a ctx and how many levels deep the
+// tokenizer can go.
+#define _STRUCT_TYPE_STACK_SIZE 512
+#define _VALUE_BUF_MIN_SIZE 64
+
+static const uint8_t kHexValueTable[55] = { // indexed by (byte - '0') for bytes '0'..'f'
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-9
+ -1, -1, -1, -1, -1, -1, -1, // ':'..'@': not hex (note: -1 is stored as 255 in a uint8_t)
+ 10, 11, 12, 13, 14, 15, // A-F
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'G'.. : not hex
+ -1, -1, -1, -1, -1, -1, -1, // ..'`': not hex
+ 10, 11, 12, 13, 14, 15 // a-f
+};
+
+typedef uint8_t jsont_tok_t;
+
+typedef struct jsont_ctx {
+ void* user_data; // opaque pointer handed to jsont_create()
+ const uint8_t* input_buf; // start of the input given to jsont_reset()
+ const uint8_t* input_buf_ptr; // next unread byte within input_buf
+ size_t input_len; // total number of bytes at input_buf
+ const uint8_t* input_buf_value_start; // first byte of the current value's slice of the input
+ const uint8_t* input_buf_value_end; // one past the last byte of that slice
+ struct {
+ uint8_t* data; // heap buffer, allocated and grown on demand
+ size_t size; // allocated capacity of `data`, in bytes
+ size_t length; // number of bytes of `data` currently in use
+ bool inuse; // true when the current value lives here instead of in the input slice
+ } value_buf;
+ jsont_err_t error_info; // details of the last error; 0 after jsont_reset()
+ jsont_tok_t curr_tok; // current token (JSONT_END right after jsont_reset())
+ size_t st_stack_size; // capacity of st_stack
+ size_t st_stack_len; // number of entries in use, i.e. current nesting depth
+ jsont_tok_t st_stack[_STRUCT_TYPE_STACK_SIZE]; // open containers, as *_START tokens
+} jsont_ctx_t;
+
+#define _JSONT_IN_SOURCE
+#include <jsont.h>
+
+unsigned long _hex_str_to_ul(const uint8_t* bytes, size_t len) {
+  // Parses `len` hex digits; returns ULONG_MAX on a non-hex byte or overflow.
+  unsigned long value = 0;
+  const unsigned long cutoff = ULONG_MAX / 16;
+  const int cutoff_digit = (int)(ULONG_MAX - cutoff * 16);
+
+  for (size_t i = 0; i != len; ++i) {
+    uint8_t b = bytes[i];
+    // Non-hex table slots are -1 stored as uint8_t (255): range-check the digit.
+    int digit = (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;
+    if (digit < 0 || digit > 15 ||                    // bad digit
+        (value > cutoff) ||                           // overflow
+        ((value == cutoff) && (digit > cutoff_digit)) ) {
+      return ULONG_MAX;
+    }
+    value = (value * 16) + (unsigned long)digit;
+  }
+  return value;
+}
+
+jsont_ctx_t* jsont_create(void* user_data) {
+  jsont_ctx_t* ctx = (jsont_ctx_t*)calloc(1, sizeof(jsont_ctx_t));
+  if (ctx != 0) ctx->user_data = user_data;
+  if (ctx != 0) ctx->st_stack_size = _STRUCT_TYPE_STACK_SIZE;
+  return ctx;  // NULL when the allocation failed; otherwise zero-initialized
+}
+
+void jsont_destroy(jsont_ctx_t* ctx) {
+  // Accepts NULL, like free(), so callers can clean up unconditionally.
+  if (ctx == 0) return;
+  free(ctx->value_buf.data);  // free(NULL) is a no-op, so no guard is needed
+  free(ctx);
+}
+
+void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
+  ctx->input_buf = bytes;               // point the tokenizer at the new input
+  ctx->input_buf_ptr = bytes;           // ...and start reading at its first byte
+  ctx->input_len = length;
+  ctx->input_buf_value_start = ctx->input_buf_value_end = 0;  // no current value
+  ctx->value_buf.inuse = false;         // the buffer's storage is kept for reuse
+  ctx->value_buf.length = 0;
+  ctx->st_stack_len = 0;                // not inside any object or array
+  ctx->curr_tok = JSONT_END;
+  ctx->error_info = 0;
+}
+
+jsont_tok_t jsont_current(const jsont_ctx_t* ctx) {
+ return ctx->curr_tok; // as last stored by _set_tok(), or JSONT_END right after jsont_reset()
+}
+
+void* jsont_user_data(const jsont_ctx_t* ctx) {
+ return ctx->user_data; // the pointer that was given to jsont_create()
+}
+
+// Get the last byte read (0 if none was read yet). Suitable for debugging JSONT_ERR
+uint8_t jsont_current_byte(jsont_ctx_t* ctx) {
+  return (ctx->input_buf_ptr == ctx->input_buf) ? 0 : *(ctx->input_buf_ptr-1);
+}
+
+size_t jsont_current_offset(jsont_ctx_t* ctx) {
+ return ctx->input_buf_ptr - ctx->input_buf; // number of input bytes consumed so far
+}
+
+jsont_err_t jsont_error_info(jsont_ctx_t* ctx) {
+ return ctx->error_info; // 0 after jsont_reset(); set when an error token is produced
+}
+
+inline static bool _no_value(jsont_ctx_t* ctx) {
+  const jsont_tok_t tok = ctx->curr_tok;
+  if (ctx->input_buf_value_start == 0) return true;  // no value slice recorded
+  return tok < _JSONT_VALUES_START || tok > _JSONT_VALUES_END;
+}
+
+inline static size_t _input_avail(jsont_ctx_t* ctx) {
+ return ctx->input_len - (ctx->input_buf_ptr - ctx->input_buf); // input bytes not yet consumed
+}
+
+inline static uint8_t _next_byte(jsont_ctx_t* ctx) {
+ return (_input_avail(ctx) == 0) ? 0 : *(ctx->input_buf_ptr++); // 0 once input is exhausted; else consumes one byte
+}
+
+inline static jsont_tok_t _st_stack_top(const jsont_ctx_t* ctx) {
+  if (ctx->st_stack_len == 0) return JSONT_END;  // empty stack: not in a container
+  return ctx->st_stack[ctx->st_stack_len-1];
+}
+
+size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes) {
+  // Points `*bytes` at the current value and returns its length in bytes.
+  // Without a current value, returns 0 and leaves `*bytes` untouched.
+  if (_no_value(ctx)) return 0;
+
+  if (ctx->value_buf.inuse) {
+    // The value was assembled in the side buffer rather than the raw input.
+    *bytes = ctx->value_buf.data;
+    return ctx->value_buf.length;
+  }
+  *bytes = ctx->input_buf_value_start;
+  return ctx->input_buf_value_end - ctx->input_buf_value_start;
+}
+
+bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length) {
+  // Select whichever storage holds the current value: the side buffer or the
+  // raw slice of the input. Equal means same length and same bytes.
+  const bool buffered = ctx->value_buf.inuse;
+  const uint8_t* cur = buffered ? ctx->value_buf.data
+                                : ctx->input_buf_value_start;
+  size_t cur_len = buffered
+    ? ctx->value_buf.length
+    : (size_t)(ctx->input_buf_value_end - ctx->input_buf_value_start);
+  return (cur_len == length) && (memcmp(cur, bytes, length) == 0);
+}
+
+char* jsont_strcpy_value(jsont_ctx_t* ctx) {
+  // Returns a newly malloc'ed, NUL-terminated copy of the current value, which
+  // the caller must free(). Returns NULL when the current token has no value
+  // or when the allocation fails.
+  if (_no_value(ctx)) return 0;
+
+  const uint8_t* bytes = 0;
+  size_t len = jsont_data_value(ctx, &bytes);
+  char* buf = (char*)malloc(len+1);
+  if (buf == 0) return 0;  // out of memory
+  if (len != 0) memcpy(buf, bytes, len);  // nothing to copy for an empty value
+  buf[len] = 0;
+  return buf;
+}
+
+int64_t jsont_int_value(jsont_ctx_t* ctx) {
+  // Parses the current value as a base-10 signed 64-bit integer.
+  //
+  // Accepts an optional leading '-' or '+' followed by digits; parsing stops
+  // at the first non-digit byte. Results:
+  //   - no current value, or an empty one: returns INT64_MIN (errno untouched)
+  //   - no digits at all (e.g. "-", "abc"): returns INT64_MIN, errno = EINVAL
+  //   - magnitude out of range: returns INT64_MIN or INT64_MAX (by sign),
+  //     errno = ERANGE
+  if (_no_value(ctx)) {
+    return INT64_MIN;
+  }
+
+  const uint8_t* p = 0;
+  size_t len = jsont_data_value(ctx, &p);
+  if (len == 0) {
+    return INT64_MIN;
+  }
+  // One past the last byte of the value. Every read below is bounds-checked
+  // against `end`, so nothing outside the value slice is ever touched.
+  const uint8_t* end = p + len;
+  const int base = 10;
+
+  bool negative = false;
+  if (*p == '-' || *p == '+') {
+    negative = (*p == '-');
+    ++p;
+  }
+
+  // Largest magnitude that fits: 2^63 for negatives, 2^63-1 otherwise. Split
+  // into cutoff (all but the last digit) and cutlim (largest last digit) so
+  // overflow can be detected before it happens.
+  uint64_t cutoff = negative
+    ? (uint64_t)-(INT64_MIN + INT64_MAX) + INT64_MAX
+    : INT64_MAX;
+  const int cutlim = (int)(cutoff % base);
+  cutoff /= base;
+
+  uint64_t acc = 0;
+  int any = 0;  // 0: no digits yet, 1: have digits, -1: overflowed
+  for ( ; p != end; ++p) {
+    if (*p < '0' || *p > '9') break;
+    const int digit = *p - '0';
+    if (any < 0 || acc > cutoff || (acc == cutoff && digit > cutlim)) {
+      any = -1;
+    } else {
+      any = 1;
+      acc *= base;
+      acc += digit;
+    }
+  }
+
+  if (any < 0) {
+    acc = negative ? INT64_MIN : INT64_MAX;
+    errno = ERANGE;
+  } else if (!any) {
+    errno = EINVAL;
+    return INT64_MIN;
+  } else if (negative) {
+    acc = -acc;
+  }
+
+  return (int64_t)acc;
+}
+
+#ifdef NAN
+ #define _JSONT_NAN NAN
+#else
+ #define _JSONT_NAN nan(0)
+#endif
+
+double jsont_float_value(jsont_ctx_t* ctx) {
+  const uint8_t* bytes = 0;
+  size_t len = _no_value(ctx) ? 0 : jsont_data_value(ctx, &bytes);
+  char stackbuf[64];  // holds any realistic number without touching the heap
+  char* s = (len < sizeof(stackbuf)) ? stackbuf : (char*)malloc(len + 1);
+  if (len == 0 || s == 0) {  // no (or empty) value, or out of memory
+    errno = EINVAL;
+    return _JSONT_NAN;
+  }
+  memcpy(s, bytes, len);  // the value slice is not NUL-terminated, so parse a
+  s[len] = 0;             // terminated copy instead of reading past its end
+  double v = strtod(s, 0);
+  if (s != stackbuf) free(s);
+  return v;
+}
+
+inline static jsont_tok_t _set_tok(jsont_ctx_t* ctx, jsont_tok_t tok) {
+  // Records `tok` as the current token and keeps the structure stack in sync:
+  // "{" and "[" push themselves, "}" and "]" must match the top entry and pop
+  // it. On a violation the current token becomes JSONT_ERR (which is also
+  // returned) and error_info describes the problem.
+  ctx->curr_tok = tok;
+
+  switch (tok) {
+    case JSONT_OBJECT_START:
+    case JSONT_ARRAY_START:
+      if (ctx->st_stack_len == ctx->st_stack_size) {
+        ctx->error_info = JSONT_ERRINFO_STACK_SIZE;
+        return ctx->curr_tok = JSONT_ERR; // TODO: Grow st_stack
+      }
+      ctx->st_stack[ctx->st_stack_len++] = tok;
+      break;
+    case JSONT_OBJECT_END:
+      if (_st_stack_top(ctx) != JSONT_OBJECT_START) {
+        ctx->error_info = JSONT_ERRINFO_UNEXPECTED_OBJECT_END;
+        return ctx->curr_tok = JSONT_ERR;
+      }
+      --ctx->st_stack_len;
+      break;
+    case JSONT_ARRAY_END:
+      if (_st_stack_top(ctx) != JSONT_ARRAY_START) {
+        ctx->error_info = JSONT_ERRINFO_UNEXPECTED_ARRAY_END;
+        return ctx->curr_tok = JSONT_ERR;
+      }
+      --ctx->st_stack_len;
+      break;
+    default:  // every other token leaves the stack untouched
+      break;
+  }
+
+  return tok;
+}
+inline static void _rewind_one_byte(jsont_ctx_t* ctx) {
+ --ctx->input_buf_ptr; // un-read one byte; no bounds check is made here
+}
+inline static void _rewind_bytes(jsont_ctx_t* ctx, size_t n) {
+ ctx->input_buf_ptr -= n; // un-read n bytes; no bounds check is made here
+}
+inline static void _skip_bytes(jsont_ctx_t* ctx, size_t n) {
+ ctx->input_buf_ptr += n; // consume n bytes unread; no bounds check is made here
+}
+inline static uint8_t _read_atom(jsont_ctx_t* ctx, size_t slacklen,
+                                 jsont_tok_t tok) {
+  // NOTE(review): the `slacklen` trailing bytes are skipped without being
+  // compared against the literal's spelling, so e.g. "nxyz" yields JSONT_NULL.
+  if (_input_avail(ctx) >= slacklen) {
+    _skip_bytes(ctx, slacklen); // e.g. "ull" after "n" or "alse" after "f"
+    return _set_tok(ctx, tok);
+  }
+  _rewind_one_byte(ctx);  // rewind and wait for buffer fill
+  return _set_tok(ctx, JSONT_END);
+}
+inline static bool _expects_field_name(jsont_ctx_t* ctx) {
+  // A field name follows "{" directly, or a "," while inside an object.
+  if (ctx->curr_tok == JSONT_OBJECT_START) return true;
+  return ctx->curr_tok == _JSONT_COMMA && _st_stack_top(ctx) == JSONT_OBJECT_START;
+}
+
+static void _value_buf_append(jsont_ctx_t* ctx, const uint8_t* data, size_t len) {
+  // Appends `len` bytes from `data` to the value buffer, growing it on demand,
+  // and marks the buffer as holding the current value.
+  if (ctx->value_buf.size == 0 ||
+      ctx->value_buf.length + len > ctx->value_buf.size) {
+    size_t new_size = ctx->value_buf.size + (len * 2);
+    if (new_size < _VALUE_BUF_MIN_SIZE) {
+      new_size = _VALUE_BUF_MIN_SIZE;
+    }
+    // realloc(NULL, n) acts like malloc(n), so this also covers the first
+    // allocation. Keep the old pointer until the call is known to succeed.
+    uint8_t* grown = (uint8_t*)realloc(ctx->value_buf.data, new_size);
+    if (grown == 0) {
+      abort();  // no way to report OOM from here; never continue with NULL
+    }
+    ctx->value_buf.data = grown;
+    ctx->value_buf.size = new_size;
+  }
+
+  if (len != 0) {
+    memcpy(ctx->value_buf.data + ctx->value_buf.length, data, len);
+    ctx->value_buf.length += len;
+  }
+  ctx->value_buf.inuse = true;
+}
+
+// Read and return the next token from the input of `ctx`.
+//
+// Structural bytes map directly to tokens, the atoms null/true/false are
+// delegated to `_read_atom`, and strings and numbers record their extent in
+// `ctx->input_buf_value_start` / `ctx->input_buf_value_end`. A string that
+// contains escape sequences is additionally decoded into `ctx->value_buf`.
+//
+// A zero byte is treated as the end of the input: JSONT_END is returned, and
+// when that happens inside a string the input is rewound to the start of the
+// string first. Unexpected input yields JSONT_ERR with `ctx->error_info` set.
+jsont_tok_t jsont_next(jsont_ctx_t* ctx) {
+  //
+  // { } [ ] n t f "
+  //         | | | |
+  //         | | | +- /[^"]*/ "
+  //         | | +- a l s e
+  //         | +- r u e
+  //         +- u l l
+  //
+  while (1) {
+    uint8_t b = _next_byte(ctx);
+    switch (b) {
+      case '{': return _set_tok(ctx, JSONT_OBJECT_START);
+      case '}': return _set_tok(ctx, JSONT_OBJECT_END);
+      case '[': return _set_tok(ctx, JSONT_ARRAY_START);
+      case ']': return _set_tok(ctx, JSONT_ARRAY_END);
+      case 'n': return _read_atom(ctx, 3, JSONT_NULL);
+      case 't': return _read_atom(ctx, 3, JSONT_TRUE);
+      case 'f': return _read_atom(ctx, 4, JSONT_FALSE);
+      case '"': {
+        ctx->input_buf_value_start = ctx->input_buf_ptr;
+        ctx->value_buf.inuse = false;
+        ctx->value_buf.length = 0;
+        // `prev_b` is '\\' only while an escape prefix is pending.
+        uint8_t prev_b = 0;
+        while (1) {
+          b = _next_byte(ctx);
+
+          if (b == '\\') {
+            if (prev_b == '\\') {
+              // This is an actual '\'.
+              assert(ctx->value_buf.inuse == true); // should be buffering
+              _value_buf_append(ctx, ctx->input_buf_ptr-1, 1); // append "\"
+              // The escape sequence "\\" is now complete. Clear the pending
+              // escape so that the byte following it is NOT interpreted as
+              // escaped (otherwise "\\" would never terminate and "\\n" would
+              // decode to a backslash followed by a newline).
+              prev_b = 0;
+              continue;
+            } else {
+              // Okay, this is an escape prefix. Move to buffering value.
+              if (ctx->value_buf.inuse == 0) {
+                _value_buf_append(ctx,
+                  ctx->input_buf_value_start,
+                  // any data before the "\":
+                  (ctx->input_buf_ptr-1 - ctx->input_buf_value_start) );
+              }
+            }
+          } else {
+            // Any byte except '\'
+
+            if (prev_b == '\\') {
+              // Currently just after an escape character
+              assert(ctx->value_buf.inuse == true); // should be buffering
+
+              // JSON specifies a few "magic" characters that have a different
+              // meaning than their value:
+              switch (b) {
+                case 'b':
+                  _value_buf_append(ctx, (const uint8_t*)"\b", 1);
+                  break;
+                case 'f':
+                  _value_buf_append(ctx, (const uint8_t*)"\f", 1);
+                  break;
+                case 'n':
+                  _value_buf_append(ctx, (const uint8_t*)"\n", 1);
+                  break;
+                case 'r':
+                  _value_buf_append(ctx, (const uint8_t*)"\r", 1);
+                  break;
+                case 't':
+                  _value_buf_append(ctx, (const uint8_t*)"\t", 1);
+                  break;
+                case 'u': {
+                  // 4 hex digits should follow
+                  if (_input_avail(ctx) < 4) {
+                    _rewind_bytes(ctx,
+                      ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
+                    return _set_tok(ctx, JSONT_END);
+                  }
+                  unsigned long utf16cp = _hex_str_to_ul(ctx->input_buf_ptr, 4);
+                  ctx->input_buf_ptr += 4;
+                  if (utf16cp == ULONG_MAX) {
+                    ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
+                    return _set_tok(ctx, JSONT_ERR);
+                  }
+
+                  uint32_t cp = (uint16_t)(0xffff & utf16cp);
+
+                  // Is lead surrogate?
+                  if (cp >= 0xd800u && cp <= 0xdbffu) {
+                    // TODO: Implement pairs by reading another "\uHHHH"
+                    ctx->error_info = JSONT_ERRINFO_UNEXPECTED_UNICODE_SEQ;
+                    return _set_tok(ctx, JSONT_ERR);
+                  }
+
+                  // Append UTF-8 byte(s) representing the Unicode codepoint `cp`
+                  if (cp < 0x80) {
+                    uint8_t cp8 = ((uint8_t)cp);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                  } else if (cp < 0x800) {
+                    uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                  } else {
+                    uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                    cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value_buf_append(ctx, (const uint8_t*)&cp8, 1);
+                  }
+
+                  break;
+                }
+                default: {
+                  // Any other escaped byte (e.g. '"' or '/') stands for itself
+                  _value_buf_append(ctx, &b, 1);
+                  break;
+                }
+              } // switch
+
+            } else {
+              // Previous character was NOT an escape character
+
+              if (b == '"') {
+                // Well, this marks the end of a string
+                ctx->input_buf_value_end = ctx->input_buf_ptr-1;
+                return _set_tok(ctx, _expects_field_name(ctx)
+                  ? JSONT_FIELD_NAME : JSONT_STRING);
+              } else if (b == 0) {
+                // Input buffer ends in the middle of a string
+                _rewind_bytes(ctx,
+                  ctx->input_buf_ptr - (ctx->input_buf_value_start-1));
+                return _set_tok(ctx, JSONT_END);
+              } else {
+                if (ctx->value_buf.inuse) {
+                  _value_buf_append(ctx, &b, 1);
+                }
+              }
+            }
+          }
+
+          prev_b = b;
+        }
+      }
+      case ',':
+        if (  ctx->curr_tok == JSONT_OBJECT_START
+           || ctx->curr_tok == JSONT_ARRAY_START
+           || ctx->curr_tok == JSONT_END
+           || ctx->curr_tok == JSONT_ERR) {
+          if (ctx->curr_tok != JSONT_ERR)
+            ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COMMA;
+          return _set_tok(ctx, JSONT_ERR);
+        }
+        _set_tok(ctx, _JSONT_COMMA);
+        // read next by simply letting the outer "while" do its thing
+        break;
+
+      case ':':
+        if (ctx->curr_tok != JSONT_FIELD_NAME) {
+          ctx->error_info = JSONT_ERRINFO_UNEXPECTED_COLON;
+          return _set_tok(ctx, JSONT_ERR);
+        }
+        // let the outer "while" do its thing
+        break;
+
+      case ' ': case '\r': case '\n': case '\t':
+        // ignore whitespace and let the outer "while" do its thing
+        break;
+
+      case 0:
+        //printf("** %d\n", __LINE__);
+        return _set_tok(ctx, JSONT_END);
+
+      default:
+        if (isdigit((int)b) || b == '+' || b == '-') {
+          // We are reading a number
+          ctx->input_buf_value_start = ctx->input_buf_ptr-1;
+          //uint8_t prev_b = 0;
+          bool is_float = false;
+          while (1) {
+            b = _next_byte(ctx);
+            if (b == '.') {
+              is_float = true;
+            } else if (!isdigit((int)b)) {
+              // Any non-digit (including a zero byte) terminates the number
+              _rewind_one_byte(ctx);
+              ctx->input_buf_value_end = ctx->input_buf_ptr;
+              return _set_tok(ctx, is_float ? JSONT_NUMBER_FLOAT
+                                            : JSONT_NUMBER_INT);
+            } else if (b == 0) {
+              // NOTE(review): unreachable -- a zero byte is not a digit, so
+              // the branch above always handles it first. Kept as-is because
+              // making it reachable would change how a number at the very end
+              // of the input is reported (number token vs. JSONT_END).
+              // Input buffer ends before we know that the number-value ended
+              _rewind_bytes(ctx, ctx->input_buf_ptr
+                                 - (ctx->input_buf_value_start-1));
+              return _set_tok(ctx, JSONT_END);
+            }
+          }
+        }
+
+        ctx->error_info = JSONT_ERRINFO_UNEXPECTED;
+        return _set_tok(ctx, JSONT_ERR);
+    }
+  } // while (1)
+}
+
diff --git a/jsont.cc b/jsont.cc
new file mode 100644
index 0000000..09b1e45
--- /dev/null
+++ b/jsont.cc
@@ -0,0 +1,561 @@
+#include "jsont.hh"
+
+namespace jsont {
+
+// Maps an ASCII byte in the range '0'..'f' (indexed by `byte - '0'`) to the
+// value of the hexadecimal digit it represents, or -1 if the byte is not a
+// hexadecimal digit.
+static const int8_t kHexValueTable[55] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-9
+ -1, -1, -1, -1, -1, -1, -1,
+ 10, 11, 12, 13, 14, 15, // A-F
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1,
+ 10, 11, 12, 13, 14, 15 // a-f
+};
+
+// Parse `len` bytes at `bytes` as an unsigned hexadecimal number.
+//
+// Returns the parsed value, or UINT64_MAX if any byte is not a hexadecimal
+// digit or if the value would not fit in 64 bits.
+static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
+  uint64_t value = 0;
+  // Largest value that can still be multiplied by 16 without overflowing, and
+  // the largest digit that may be added once that value has been reached.
+  uint64_t cutoff = UINT64_MAX / 16;
+  int cutoff_digit = (int)(UINT64_MAX - cutoff * 16);
+
+  for (size_t i = 0; i != len; ++i) {
+    uint8_t b = bytes[i];
+    int8_t digit = (b > '0'-1 && b < 'f'+1) ? kHexValueTable[b-'0'] : -1;
+    // Note: the validity check must look at `digit`; the input byte `b` is
+    // unsigned and can never compare equal to -1.
+    if (digit == -1 || // bad digit
+        (value > cutoff) || // overflow
+        ((value == cutoff) && (digit > cutoff_digit)) ) {
+      return UINT64_MAX;
+    } else {
+      value = (value * 16) + (uint64_t)digit;
+    }
+  }
+
+  return value;
+}
+
+
+// Not-a-number constant returned by Tokenizer::floatValue() when a literal
+// cannot be interpreted. Uses the NAN macro when available.
+// NOTE(review): the fallback passes 0 (a null pointer) to nan(), which is
+// declared to take a C string -- confirm whether nan("") was intended.
+#ifdef NAN
+ #define _JSONT_NAN NAN
+#else
+ #define _JSONT_NAN nan(0)
+#endif
+
+
+// Returns a static, human-readable name for `tok`, suitable for logging and
+// debugging. Tokens without a public name (e.g. the internal comma marker)
+// yield "?".
+const char* token_name(jsont::Token tok) {
+  switch (tok) {
+    case End:         return "End";
+    case ObjectStart: return "ObjectStart";
+    case ObjectEnd:   return "ObjectEnd";
+    case ArrayStart:  return "ArrayStart";
+    case ArrayEnd:    return "ArrayEnd";
+    case True:        return "True";
+    case False:       return "False";
+    case Null:        return "Null";
+    case Integer:     return "Integer";
+    case Float:       return "Float";
+    case String:      return "String";
+    case FieldName:   return "FieldName";
+    // `Error` is a public token returned by Tokenizer::next(); give it a name
+    // so diagnostics do not print "?" exactly when something went wrong.
+    case Error:       return "Error";
+    default:          return "?";
+  }
+}
+
+
+// Private helpers for Tokenizer. Declared as a friend of Tokenizer so the
+// helpers can reach its private input state without being part of the public
+// class declaration.
+class TokenizerInternal {
+public:
+  // Pointer to the next unread input byte of `self`.
+  inline static const uint8_t* currentInput(const Tokenizer& self) {
+    return &self._input.bytes[self._input.offset];
+  }
+
+  // Consume the remainder of an atom (e.g. "ull" after a leading 'n').
+  // Expects the next `len` input bytes to equal `str`; on success advances the
+  // input past them and sets `token`, otherwise sets an error.
+  inline static const Token& readAtom(Tokenizer& self, const char* str,
+                                      size_t len, const Token& token) {
+    if (self.availableInput() < len) {
+      return self.setError(Tokenizer::PrematureEndOfInput);
+    }
+    const bool matches = (memcmp(currentInput(self), str, len) == 0);
+    if (!matches) {
+      return self.setError(Tokenizer::InvalidByte);
+    }
+    self._input.offset += len;
+    return self.setToken(token);
+  }
+};
+
+
+// Nothing to release explicitly: the destructor body is empty.
+Tokenizer::~Tokenizer() {}
+
+
+// Point the tokenizer at a new input of `length` bytes starting at `bytes`,
+// clear the error code and immediately read the first token. Only UTF-8 input
+// is supported (asserted).
+void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) {
+  assert(encoding == UTF8TextEncoding); // only supported encoding
+
+  _error.code = UnspecifiedError;
+
+  _input.offset = 0;
+  _input.length = length;
+  _input.bytes = reinterpret_cast<const uint8_t*>(bytes);
+
+  // Advance to first token
+  next();
+}
+
+
+const char* Tokenizer::errorMessage() const {
+ switch (_error.code) {
+ case UnexpectedComma:
+ return "Unexpected comma";
+ case UnexpectedTrailingComma:
+ return "Unexpected trailing comma";
+ case InvalidByte:
+ return "Invalid input byte";
+ case PrematureEndOfInput:
+ return "Premature end of input";
+ case MalformedUnicodeEscapeSequence:
+ return "Malformed Unicode escape sequence";
+ case MalformedNumberLiteral:
+ return "Malformed number literal";
+ case UnterminatedString:
+ return "Unterminated string";
+ case SyntaxError:
+ return "Illegal JSON (syntax error)";
+ default:
+ return "Unspecified error";
+ }
+}
+
+
+// Stores in `*bytes` a pointer to the bytes of the current value and returns
+// their count. The pointer refers either to the internal decode buffer (for
+// strings that contained escape sequences) or directly into the input.
+// Returns 0 and leaves `*bytes` untouched if the current token has no value.
+//
+// The parameter type is spelled `const char**`: the previous spelling
+// `const char const**` repeated the same qualifier, which names the same type
+// but is rejected (or warned about) by C++ compilers.
+size_t Tokenizer::dataValue(const char** bytes) const {
+  if (!hasValue()) { return 0; }
+  if (_value.buffered) {
+    *bytes = (const char*)_value.buffer.data();
+    return _value.buffer.size();
+  } else {
+    *bytes = (const char*)(_input.bytes + _value.offset);
+    return _value.length;
+  }
+}
+
+
+// Returns the current value interpreted as a double via strtod().
+// For tokens without a value, returns 1.0 for True and 0.0 otherwise.
+// Returns NaN if the literal sits at the very end of the input and is longer
+// than 127 bytes (see below).
+double Tokenizer::floatValue() const {
+  if (!hasValue()) {
+    return _token == jsont::True ? 1.0 : 0.0;
+  }
+
+  const char* bytes;
+
+  if (_value.buffered) {
+    // edge-case since only happens with string values using escape sequences
+    bytes = _value.buffer.c_str();
+  } else {
+    bytes = (const char*)_input.bytes + _value.offset;
+    if (availableInput() == 0) {
+      // In this case where the data lies at the edge of the buffer, we can't pass
+      // it directly to strtod, since there will be no sentinel byte. We are fine
+      // with a copy, since this is an edge case (only happens either for broken
+      // JSON or when the whole document is just a number).
+      //
+      // Note: this must be an array of char (not of char*), otherwise the
+      // terminator below is written at the wrong byte offset and the copied
+      // literal is left unterminated.
+      char buf[128];
+      if (_value.length > sizeof(buf) - 1) {
+        // We are unable to interpret such a large literal in this edge-case
+        return _JSONT_NAN;
+      }
+      memcpy((void*)buf, (const void*)bytes, _value.length);
+      buf[_value.length] = '\0';
+      return strtod(buf, (char**)0);
+    }
+  }
+
+  return strtod(bytes, (char**)0);
+}
+
+
+// Returns the current value interpreted as a base-10 integer via strtoll().
+// For tokens without a value, returns 1 for True and 0 otherwise.
+// Returns 0 if the literal sits at the very end of the input and is longer
+// than 20 bytes (see below).
+int64_t Tokenizer::intValue() const {
+  if (!hasValue()) {
+    return _token == jsont::True ? 1LL : 0LL;
+  }
+
+  const char* bytes;
+
+  if (_value.buffered) {
+    // edge-case since only happens with string values using escape sequences
+    bytes = _value.buffer.c_str();
+  } else {
+    bytes = (const char*)_input.bytes + _value.offset;
+    if (availableInput() == 0) {
+      // In this case where the data lies at the edge of the buffer, we can't pass
+      // it directly to strtoll, since there will be no sentinel byte. We are fine
+      // with a copy, since this is an edge case (only happens either for broken
+      // JSON or when the whole document is just a number).
+      //
+      // Note: this must be an array of char (not of char*), otherwise the
+      // terminator below is written at the wrong byte offset and the copied
+      // literal is left unterminated.
+      char buf[21];
+      if (_value.length > sizeof(buf) - 1) {
+        // We are unable to interpret such a large literal in this edge-case
+        return 0;
+      }
+      memcpy((void*)buf, (const void*)bytes, _value.length);
+      buf[_value.length] = '\0';
+      return strtoll(buf, (char**)0, 10);
+    }
+  }
+
+  return strtoll(bytes, (char**)0, 10);
+}
+
+
+// Read the next token from the input and return a reference to it (the same
+// token is afterwards available through current()).
+//
+// Returns End when the input is exhausted and Error (with the error code set)
+// on malformed input. For strings, field names and numbers the extent of the
+// value is recorded in `_value`; strings containing escape sequences are
+// decoded into `_value.buffer`.
+//
+// Fixes relative to the previous revision:
+//  - the in-string bounds assertion no longer fires on the last input byte;
+//  - a string is only considered terminated by an unescaped '"' (an escaped
+//    quote at the very end of the input used to be accepted as terminator);
+//  - a ',' following a string is left for the next call so that it is
+//    subject to the same comma checks as a ',' following any other value;
+//  - the recorded length of a number literal covers the whole literal;
+//  - a literal consisting only of a sign is reported as malformed;
+//  - a number that ends together with the input is returned as a token
+//    instead of being dropped.
+// NOTE(review): exponents are still only accepted after a '.', so "1e5" is
+// reported as MalformedNumberLiteral.
+const Token& Tokenizer::next() {
+  //
+  // { } [ ] n t f "
+  //         | | | |
+  //         | | | +- /[^"]*/ "
+  //         | | +- a l s e
+  //         | +- r u e
+  //         +- u l l
+  //
+  while (!endOfInput()) {
+    uint8_t b = _input.bytes[_input.offset++];
+    switch (b) {
+      case '{': return setToken(ObjectStart);
+      case '}': {
+        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
+        return setToken(ObjectEnd);
+      }
+
+      case '[': return setToken(ArrayStart);
+      case ']': {
+        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
+        return setToken(ArrayEnd);
+      }
+
+      case 'n':
+        return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null);
+      case 't':
+        return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True);
+      case 'f':
+        return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False);
+
+      case ' ': case '\t': case '\r': case '\n': // IETF RFC4627
+        // ignore whitespace and let the outer "while" do its thing
+        break;
+
+      case 0:
+        return setError(InvalidByte);
+
+      // when we read a value, we don't produce a token until we either reach
+      // end of input, a colon (then the value is a field name), a comma, or an
+      // array or object terminator.
+
+      case '"': {
+        _value.beginAtOffset(_input.offset);
+        // Set only when an unescaped closing quote has been consumed.
+        bool terminated = false;
+
+        while (!terminated && !endOfInput()) {
+          b = _input.bytes[_input.offset++];
+          // The offset may equal the length once the last byte was consumed.
+          assert(_input.offset <= _input.length);
+
+          switch (b) {
+
+            case '\\': {
+              // We must go buffered since the input segment != value
+              if (!_value.buffered) {
+                _value.buffered = true;
+                _value.buffer.assign(
+                  (const char*)(_input.bytes+_value.offset),
+                  _input.offset - _value.offset - 1
+                );
+              }
+
+              if (endOfInput()) {
+                return setError(PrematureEndOfInput);
+              }
+
+              b = _input.bytes[_input.offset++];
+              switch (b) {
+                case 'b': _value.buffer.append(1, '\x08'); break;
+                case 'f': _value.buffer.append(1, '\x0C'); break;
+                case 'n': _value.buffer.append(1, '\x0A'); break;
+                case 'r': _value.buffer.append(1, '\x0D'); break;
+                case 't': _value.buffer.append(1, '\x09'); break;
+                case 'u': {
+                  // \uxxxx
+                  if (availableInput() < 4) {
+                    return setError(PrematureEndOfInput);
+                  }
+
+                  uint64_t utf16cp =
+                    _xtou64(TokenizerInternal::currentInput(*this), 4);
+                  _input.offset += 4;
+
+                  if (utf16cp > 0xffff) {
+                    return setError(MalformedUnicodeEscapeSequence);
+                  }
+
+                  uint16_t cp = (uint16_t)(0xffff & utf16cp);
+
+                  // Append UTF-8 byte(s) representing the Unicode codepoint cp
+                  if (cp < 0x80) {
+                    // U+0000 - U+007F
+                    uint8_t cp8 = ((uint8_t)cp);
+                    _value.buffer.append(1, (char)cp8);
+                  } else if (cp < 0x800) {
+                    // U+0080 - U+07FF
+                    uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                  } else if (cp >= 0xD800u && cp <= 0xDFFFu) {
+                    // UTF-16 Surrogate pairs -- according to the UTF-8
+                    // definition (RFC 3629) the high and low surrogate halves
+                    // used by UTF-16 (U+D800 through U+DFFF) are not legal
+                    // Unicode values, and the UTF-8 encoding of them is an
+                    // invalid byte sequence. Instead of throwing an error, we
+                    // substitute this character with the replacement character
+                    // U+FFFD (UTF-8: EF,BF,BD).
+                    _value.buffer.append("\xEF\xBF\xBD");
+                  } else {
+                    // U+0800 - U+FFFF
+                    uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                  }
+
+                  break;
+                }
+                default:
+                  // Any other escaped byte (e.g. '"', '\\', '/') is taken
+                  // verbatim.
+                  _value.buffer.append(1, (char)b); break;
+              }
+              break;
+            }
+
+            case '"':
+              terminated = true;
+              break;
+
+            case 0:
+              return setError(InvalidByte);
+
+            default: {
+              if (_value.buffered) {
+                // TODO: Make this efficient by appending chunks between
+                // boundaries instead of appending per-byte
+                _value.buffer.append(1, (char)b);
+              }
+              break;
+            }
+          } // switch(b)
+        } // while (!terminated && !endOfInput())
+
+        if (!terminated) {
+          return setError(UnterminatedString);
+        }
+
+        if (!_value.buffered) {
+          // Everything between the quotes; the closing quote was consumed.
+          _value.length = _input.offset - _value.offset - 1;
+        }
+
+        // is this a field name?
+        while (!endOfInput()) {
+          b = _input.bytes[_input.offset++];
+          switch (b) {
+            case ' ': case '\t': case '\r': case '\n': break;
+            case ':': return setToken(FieldName);
+            case ',': case ']': case '}': {
+              // Not a field name. Rewind so that the next call sees the
+              // separator/terminator and applies the regular checks to it.
+              --_input.offset;
+              return setToken(jsont::String);
+            }
+            case 0: return setError(InvalidByte);
+            default: {
+              // Expected a comma or a colon
+              return setError(SyntaxError);
+            }
+          }
+        }
+
+        // Input ended right after the string
+        return setToken(jsont::String);
+      }
+
+      case ',': {
+        if (_token == ObjectStart || _token == ArrayStart || _token == _Comma) {
+          return setError(UnexpectedComma);
+        }
+        _token = _Comma;
+        break;
+      }
+
+      default: {
+        if (!(isdigit((int)b) || b == '+' || b == '-')) {
+          return setError(InvalidByte);
+        }
+
+        // We are reading a number
+        _value.beginAtOffset(_input.offset-1);
+        Token token = jsont::Integer;
+
+        while (!endOfInput()) {
+          b = _input.bytes[_input.offset++];
+          if (b >= '0' && b <= '9') {
+            continue;
+          }
+          if (b == '.') {
+            token = jsont::Float;
+            continue;
+          }
+          if (b == 'E' || b == 'e' || b == '-' || b == '+') {
+            if (token != jsont::Float) {
+              return setError(MalformedNumberLiteral);
+            }
+            continue;
+          }
+          // rewind the byte that terminated this number literal
+          --_input.offset;
+          break;
+        }
+
+        // `_input.offset` now points just past the last byte of the literal
+        // (either at the terminating byte or at the end of the input).
+        _value.length = _input.offset - _value.offset;
+
+        if ( (_value.length == 1) &&
+             (_input.bytes[_value.offset] == '-' ||
+              _input.bytes[_value.offset] == '+') ) {
+          // A sign without any digits
+          return setError(MalformedNumberLiteral);
+        }
+
+        return setToken(token);
+      }
+    }
+  }
+
+  return setToken(End);
+}
+
+
+// Classification of an input byte for Builder::appendString.
+enum {
+ kUTF8ByteVerbatim = 0,
+ kUTF8ByteEncode1, // "\u000x"
+ kUTF8ByteEncode2, // "\u00xx"
+};
+// Short aliases used only to keep the table below readable; undefined again
+// right after the table.
+#define V kUTF8ByteVerbatim
+#define E1 kUTF8ByteEncode1
+#define E2 kUTF8ByteEncode2
+// Indexed by byte value. An entry is either one of the classifications above
+// or the character that follows the backslash in a two-character escape
+// (e.g. 'n' for a line feed, '"' for a double quote).
+static const uint8_t kUTF8ByteTable[256] = {
+ E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1, E2, E2,
+ E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, V, V, '"', V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, '\\', V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, E2, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+ V, V, V, V, V, V, V, V, V, V
+};
+#undef V
+#undef E1
+#undef E2
+
+// #ifndef __has_feature
+// #define __has_feature(x) 0
+// #endif
+// #if defined(__cplusplus) && __has_feature(cxx_static_assert)
+// #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg))
+// #elif __has_feature(c_static_assert)
+// #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg))
+// #else
+// #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0)
+// #endif
+
+// Append the `length` bytes at `v` to the output as a double-quoted JSON
+// string, escaping bytes as directed by kUTF8ByteTable: either verbatim, as a
+// two-character escape (backslash + character), or as a "\u00XX" sequence.
+// Only UTF-8 input is supported (asserted). Returns *this.
+//
+// Space for the unescaped string plus both quotes is reserved up front; each
+// escape then reserves the extra bytes it needs on top of the bytes that are
+// still to be written.
+Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) {
+  // Upper-case hexadecimal digits, indexed by nibble value. (The previous
+  // arithmetic form used `> 10` as the letter threshold and therefore emitted
+  // ':' instead of 'A' for the nibble value 10, e.g. for the byte 0x1A.)
+  static const char kHexDigits[] = "0123456789ABCDEF";
+
+  reserve(length + 2);
+  _buf[_size++] = '"';
+
+  assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */);
+
+  const uint8_t* end = v+length;
+  while (v != end) {
+    uint8_t s = kUTF8ByteTable[*v];
+    switch (s) {
+      case kUTF8ByteVerbatim:
+        _buf[_size++] = *v;
+        break;
+      case kUTF8ByteEncode1: {
+        assert(*v < 16);
+        size_t remainingSize = end-v+1+5; // five additional bytes needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = 'u';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = kHexDigits[*v & 0x0f];
+        assert(_size <= _capacity);
+        break;
+      }
+      case kUTF8ByteEncode2: {
+        // Note: *v is guaranteed to be within the set [16,32),127. This is
+        // an effect of the kUTF8ByteTable lookup table and this code needs to
+        // be revised if the lookup table adds or removes any kUTF8ByteEncode.
+        assert((*v > 15 && *v < 32) || *v == 127);
+        size_t remainingSize = end-v+1+5; // five additional bytes needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = 'u';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = kHexDigits[(*v >> 4) & 0x0f];
+        _buf[_size++] = kHexDigits[*v & 0x0f];
+        assert(_size <= _capacity);
+        break;
+      }
+      default: {
+        // reverse solidus escape; `s` is the character following the '\'
+        size_t remainingSize = end-v+1+1; // one additional byte needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = s;
+        assert(_size <= _capacity);
+        break;
+      }
+    }
+
+    ++v;
+  }
+
+  _buf[_size++] = '"';
+  assert(_size <= _capacity);
+  return *this;
+}
+
+#if JSONT_CXX_RVALUE_REFS
+  // Move constructor: takes over the buffer of `other` and leaves `other`
+  // empty. Capacity and size of `other` are cleared together with its buffer
+  // pointer so that a later append on `other` allocates a fresh buffer
+  // instead of writing through a null pointer.
+  Builder::Builder(Builder&& other)
+      : _buf(other._buf)
+      , _capacity(other._capacity)
+      , _size(other._size)
+      , _state(other._state) {
+    other._buf = 0;
+    other._capacity = 0;
+    other._size = 0;
+  }
+
+  // Move assignment: releases the buffer currently owned by this object (it
+  // used to be leaked), then takes over the buffer of `other` and leaves
+  // `other` empty. Self-assignment is a no-op.
+  Builder& Builder::operator=(Builder&& other) {
+    if (this != &other) {
+      free(_buf);
+      _buf = other._buf;
+      _capacity = other._capacity;
+      _size = other._size;
+      _state = other._state;
+      other._buf = 0;
+      other._capacity = 0;
+      other._size = 0;
+    }
+    return *this;
+  }
+#endif
+
+// Copy constructor: allocates a buffer of the same capacity as `other` and
+// copies the bytes written so far. A source without a buffer (e.g. freshly
+// constructed, or after seizeBytes()) yields an empty builder without
+// allocating, instead of calling malloc(0) and memcpy() with a null source.
+Builder::Builder(const Builder& other)
+    : _buf(0)
+    , _capacity(0)
+    , _size(0)
+    , _state(other._state) {
+  if (other._buf != 0 && other._capacity != 0) {
+    _buf = (char*)malloc(other._capacity);
+    _capacity = other._capacity;
+    _size = other._size;
+    memcpy((void*)_buf, (const void*)other._buf, _size);
+  }
+}
+
+// Copy assignment: replaces the contents of this builder with a copy of
+// `other`. The previously owned buffer is released (it used to be leaked) and
+// self-assignment is a no-op (it used to replace the contents with an
+// uninitialized buffer). A source without a buffer yields an empty builder.
+Builder& Builder::operator=(const Builder& other) {
+  if (this == &other) {
+    return *this;
+  }
+
+  free(_buf);
+  _buf = 0;
+  _capacity = 0;
+  _size = 0;
+  _state = other._state;
+
+  if (other._buf != 0 && other._capacity != 0) {
+    _buf = (char*)malloc(other._capacity);
+    _capacity = other._capacity;
+    _size = other._size;
+    memcpy((void*)_buf, (const void*)other._buf, _size);
+  }
+  return *this;
+}
+
+} // namespace jsont
diff --git a/jsont.h b/jsont.h
new file mode 100644
index 0000000..22cd043
--- /dev/null
+++ b/jsont.h
@@ -0,0 +1,114 @@
+// JSON Tokenizer. Copyright (c) 2012, Rasmus Andersson. All rights reserved.
+// Use of this source code is governed by a MIT-style license that can be
+// found in the LICENSE file.
+#ifndef JSONT_INCLUDED
+#define JSONT_INCLUDED
+
+#include <stdint.h> // uint8_t, int64_t
+#include <stdlib.h> // size_t
+#include <string.h> // strlen
+#include <stdbool.h> // bool
+
+#ifndef _JSONT_IN_SOURCE
+typedef struct jsont_ctx jsont_ctx_t;
+typedef uint8_t jsont_tok_t;
+#endif
+
+#ifndef JSONT_ERRINFO_CUSTOM
+#define jsont_err_t const char*
+#endif
+
+// Token types
+// Values returned by `jsont_next` and `jsont_current`.
+enum {
+ JSONT_END = 0, // Input ended
+ JSONT_ERR, // Error
+
+ JSONT_OBJECT_START, // {
+ JSONT_OBJECT_END, // }
+
+ JSONT_ARRAY_START, // [
+ JSONT_ARRAY_END, // ]
+
+ JSONT_TRUE, // true
+ JSONT_FALSE, // false
+ JSONT_NULL, // null
+
+ // The two _JSONT_VALUES_* entries bracket the tokens that carry a data
+ // value. NOTE(review): presumably used for range checks in the
+ // implementation -- not visible from this header.
+ _JSONT_VALUES_START,
+ JSONT_NUMBER_INT, // number value without a fraction part
+ JSONT_NUMBER_FLOAT, // number value with a fraction part
+ JSONT_STRING, // string value
+ JSONT_FIELD_NAME, // field name
+ _JSONT_VALUES_END,
+
+ // Internal marker set by the tokenizer after reading a ','
+ _JSONT_COMMA,
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Create a new JSON tokenizer context. `user_data` can be anything and is
+// accessible through `jsont_user_data`.
+jsont_ctx_t* jsont_create(void* user_data);
+
+// Destroy a JSON tokenizer context. This will free any internal data, except
+// for the input buffer.
+void jsont_destroy(jsont_ctx_t* ctx);
+
+// Reset the tokenizer to parse the data pointed to by `bytes`. The tokenizer
+// does NOT take ownership of `bytes`. This function can be used to recycle a
+// tokenizer context, minimizing memory reallocation.
+void jsont_reset(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length);
+
+// Read and return the next token. See `jsont_tok_t` enum for a list of
+// possible return values and their meaning.
+jsont_tok_t jsont_next(jsont_ctx_t* ctx);
+
+// Returns the current token (last token read by `jsont_next`).
+jsont_tok_t jsont_current(const jsont_ctx_t* ctx);
+
+// Returns a slice of the input which represents the current value, or nothing
+// (returns 0) if the current token has no value (e.g. start of an object).
+size_t jsont_data_value(jsont_ctx_t* ctx, const uint8_t** bytes);
+
+// Returns true if the current data value is equal to `bytes` of `length`
+bool jsont_data_equals(jsont_ctx_t* ctx, const uint8_t* bytes, size_t length);
+
+// Returns true if the current data value is equal to c string `str`
+static inline bool jsont_str_equals(jsont_ctx_t* ctx, const char* str) {
+  // Compare against the bytes of `str`, excluding its terminating null byte
+  const size_t str_len = strlen(str);
+  return jsont_data_equals(ctx, (const uint8_t*)str, str_len);
+}
+
+// Retrieve a newly allocated c-string. Similar to `jsont_data_value` but
+// returns a newly allocated copy of the current value as a C string
+// (terminated by a null byte). The calling code is responsible for calling
+// `free()` on the returned value.
+char* jsont_strcpy_value(jsont_ctx_t* ctx);
+
+// Returns the current integer value. If the number is too large or too small,
+// this function sets errno and returns INT64_MAX or INT64_MIN.
+int64_t jsont_int_value(jsont_ctx_t* ctx);
+
+// Returns the current floating-point number value. Sets errno and returns a
+// value that isnan(N)==true on error.
+double jsont_float_value(jsont_ctx_t* ctx);
+
+// Get the last byte read. Suitable for debugging JSONT_ERR.
+uint8_t jsont_current_byte(jsont_ctx_t* ctx);
+
+// Get the current offset of the last byte read.
+size_t jsont_current_offset(jsont_ctx_t* ctx);
+
+// Get information on the last error (by default a printable text message).
+// Returns NULL if no error has occurred since a call to `jsont_reset`.
+jsont_err_t jsont_error_info(jsont_ctx_t* ctx);
+
+// Returns the value passed to `jsont_create`.
+void* jsont_user_data(const jsont_ctx_t* ctx);
+
+// ----------------- C++ -----------------
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // JSONT_INCLUDED
diff --git a/jsont.hh b/jsont.hh
new file mode 100644
index 0000000..3e86cad
--- /dev/null
+++ b/jsont.hh
@@ -0,0 +1,420 @@
+// JSON Tokenizer and builder. Copyright (c) 2012, Rasmus Andersson. All rights
+// reserved. Use of this source code is governed by a MIT-style license that can
+// be found in the LICENSE file.
+#ifndef JSONT_CXX_INCLUDED
+#define JSONT_CXX_INCLUDED
+
+#include <stdint.h> // uint8_t, int64_t
+#include <stdlib.h> // size_t
+#include <string.h> // strlen
+#include <stdbool.h> // bool
+#include <math.h>
+#include <assert.h>
+#include <string>
+#include <stdexcept>
+
+// Can haz rvalue references with move semantics?
+// Defines JSONT_CXX_RVALUE_REFS as 1 when the compiler is known to support
+// rvalue references and as 0 otherwise.
+//
+// `__has_feature(...)` is only evaluated inside a branch that is reached when
+// `__has_feature` is defined: using it in the same #if expression as
+// `defined(__has_feature)` is a preprocessing error on compilers that do not
+// provide it.
+#if (defined(_MSC_VER) && _MSC_VER >= 1600) || \
+    (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__)
+  #define JSONT_CXX_RVALUE_REFS 1
+#elif defined(__has_feature)
+  #if __has_feature(cxx_rvalue_references)
+    #define JSONT_CXX_RVALUE_REFS 1
+  #else
+    #define JSONT_CXX_RVALUE_REFS 0
+  #endif
+#else
+  #define JSONT_CXX_RVALUE_REFS 0
+#endif
+
+namespace jsont {
+
+// Tokens
+// Note: the order matters -- Tokenizer::hasValue() treats every token in the
+// range [Integer, FieldName] as carrying a value.
+typedef enum {
+ End = 0, // Input ended
+ ObjectStart, // {
+ ObjectEnd, // }
+ ArrayStart, // [
+ ArrayEnd, // ]
+ True, // true
+ False, // false
+ Null, // null
+ Integer, // number value without a fraction part
+ Float, // number value with a fraction part
+ String, // string value
+ FieldName, // field name
+ Error, // An error occurred (see `error()` for details)
+ _Comma, // Internal: a ',' was read (set by Tokenizer::next)
+} Token;
+
+// String encoding
+typedef enum {
+ UTF8TextEncoding = 0, // currently the only encoding accepted (asserted)
+} TextEncoding;
+
+// Name of `token`
+const char* token_name(jsont::Token token);
+
+class TokenizerInternal;
+
+// Reads a sequence of bytes and produces tokens and values while doing so
+class Tokenizer {
+public:
+  // Creates a tokenizer for `length` bytes at `bytes` and reads the first
+  // token. The input is referenced, not copied.
+  Tokenizer(const char* bytes, size_t length, TextEncoding encoding);
+  ~Tokenizer();
+
+  // Read next token
+  const Token& next();
+
+  // Access current token
+  const Token& current() const;
+
+  // Reset the tokenizer, making it possible to reuse this parser so to avoid
+  // unnecessary memory allocation and deallocation.
+  void reset(const char* bytes, size_t length, TextEncoding encoding);
+
+  // True if the current token has a value
+  bool hasValue() const;
+
+  // Returns a slice of the input which represents the current value, or nothing
+  // (returns 0) if the current token has no value (e.g. start of an object).
+  // (Spelled `const char**`; the former `const char const**` repeated the same
+  // qualifier, which names the same type but is not valid C++.)
+  size_t dataValue(const char** bytes) const;
+
+  // Returns a *copy* of the current string value.
+  std::string stringValue() const;
+
+  // Returns the current value as a double-precision floating-point number.
+  double floatValue() const;
+
+  // Returns the current value as a signed 64-bit integer.
+  int64_t intValue() const;
+
+  // Returns the current value as a boolean
+  bool boolValue() const;
+
+  // Error codes
+  typedef enum {
+    UnspecifiedError = 0,
+    UnexpectedComma,
+    UnexpectedTrailingComma,
+    InvalidByte,
+    PrematureEndOfInput,
+    MalformedUnicodeEscapeSequence,
+    MalformedNumberLiteral,
+    UnterminatedString,
+    SyntaxError,
+  } ErrorCode;
+
+  // Returns the error code of the last error
+  ErrorCode error() const;
+
+  // Returns a human-readable message for the last error. Never returns NULL.
+  const char* errorMessage() const;
+
+  // The byte offset into input where the tokenizer is currently looking. In the
+  // event of an error, this will point to the source of the error.
+  size_t inputOffset() const;
+
+  // Total number of input bytes
+  size_t inputSize() const;
+
+  // A pointer to the input data as passed to `reset` or the constructor.
+  const char* inputBytes() const;
+
+  friend class TokenizerInternal;
+private:
+  // Number of input bytes not yet consumed
+  size_t availableInput() const;
+  // Non-zero when all input bytes have been consumed
+  size_t endOfInput() const;
+  // Sets and returns the current token
+  const Token& setToken(Token t);
+  // Records `error` and sets the current token to Error
+  const Token& setError(ErrorCode error);
+
+  struct {
+    const uint8_t* bytes;
+    size_t length;
+    size_t offset;
+  } _input;
+  struct Value {
+    Value() : offset(0), length(0), buffered(false) {}
+    void beginAtOffset(size_t z);
+    size_t offset; // into _input.bytes
+    size_t length;
+    std::string buffer;
+    bool buffered; // if true, contents lives in buffer
+  } _value;
+  Token _token;
+  struct {
+    ErrorCode code;
+  } _error;
+};
+
+
+// Helps in building JSON, providing a final sequential byte buffer
+class Builder {
+public:
+ Builder() : _buf(0), _capacity(0), _size(0), _state(NeutralState) {}
+ ~Builder() { if (_buf) { free(_buf); _buf = 0; } }
+ Builder(const Builder& other);
+ Builder& operator=(const Builder& other);
+#if JSONT_CXX_RVALUE_REFS
+ Builder(Builder&& other);
+ Builder& operator=(Builder&& other);
+#endif
+
+ // Each of the following appends to the output and returns *this, so calls
+ // can be chained. A ':' or ',' separator is emitted automatically where the
+ // previous call requires one (see prefix()).
+ Builder& startObject();
+ Builder& endObject();
+ Builder& startArray();
+ Builder& endArray();
+ Builder& fieldName(const char* v, size_t length, TextEncoding e=UTF8TextEncoding);
+ Builder& fieldName(const std::string& name, TextEncoding enc=UTF8TextEncoding);
+ Builder& value(const char* v, size_t length, TextEncoding e=UTF8TextEncoding);
+ Builder& value(const char* v);
+ Builder& value(const std::string& v);
+ Builder& value(double v);
+ Builder& value(int64_t v);
+ Builder& value(int v);
+ Builder& value(unsigned int v);
+ Builder& value(long v);
+ Builder& value(bool v);
+ Builder& nullValue();
+
+ // Number of bytes written so far
+ size_t size() const;
+ // Pointer to the bytes written so far (not null-terminated by the builder)
+ const char* bytes() const;
+ // Copy of the bytes written so far
+ std::string toString() const;
+ // Hands the internal buffer over to the caller: returns it, stores its size
+ // in `size_out` and resets the builder, which no longer refers to it.
+ const char* seizeBytes(size_t& size_out);
+ // Discards what has been written (size and state), keeping the buffer
+ const void reset();
+
+private:
+ // Unused bytes left in the buffer
+ size_t available() const;
+ // Ensures that at least `size` more bytes can be written
+ void reserve(size_t size);
+ // Emits the separator required before the next field name or value
+ void prefix();
+ Builder& appendString(const uint8_t* v, size_t length, TextEncoding enc);
+ Builder& appendChar(char byte);
+
+ char* _buf;
+ size_t _capacity;
+ size_t _size;
+ // What was written last; decides which separator prefix() emits
+ enum {
+ NeutralState = 0,
+ AfterFieldName,
+ AfterValue,
+ AfterObjectStart,
+ AfterArrayStart,
+ } _state;
+};
+
+
+// Convenience function
+// Returns a new, empty Builder by value, e.g. to start a chain of calls.
+inline Builder build() { return Builder(); }
+
+
+// ------------------- internal ---------------------
+
+// Starts with the End token and delegates to reset(), which reads the first
+// token from the given input.
+inline Tokenizer::Tokenizer(const char* bytes, size_t length,
+ TextEncoding encoding) : _token(End) {
+ reset(bytes, length, encoding);
+}
+
+inline const Token& Tokenizer::current() const { return _token; }
+
+// Tokens Integer, Float, String and FieldName carry a value (relies on their
+// order in the Token enum).
+inline bool Tokenizer::hasValue() const {
+ return _token >= Integer && _token <= FieldName;
+}
+
+// Returns a copy of the bytes of the current value, or an empty string if the
+// current token has no value (dataValue() then returns 0 without setting the
+// pointer, so the pointer must not be read in that case).
+inline std::string Tokenizer::stringValue() const {
+  const char* bytes = 0;
+  size_t size = dataValue(&bytes);
+  if (size == 0) {
+    return std::string();
+  }
+  return std::string(bytes, size);
+}
+
+// True only if the current token is True
+inline bool Tokenizer::boolValue() const {
+ return _token == True;
+}
+
+// Number of input bytes not yet consumed
+inline size_t Tokenizer::availableInput() const {
+ return _input.length - _input.offset;
+}
+// Non-zero when all input bytes have been consumed
+inline size_t Tokenizer::endOfInput() const {
+ return _input.offset == _input.length;
+}
+inline const Token& Tokenizer::setToken(Token t) {
+ return _token = t;
+}
+// Records the error code and makes Error the current token
+inline const Token& Tokenizer::setError(Tokenizer::ErrorCode error) {
+ _error.code = error;
+ return _token = Error;
+}
+inline size_t Tokenizer::inputOffset() const {
+ return _input.offset;
+}
+inline size_t Tokenizer::inputSize() const {
+ return _input.length;
+}
+inline const char* Tokenizer::inputBytes() const {
+ return (const char*)_input.bytes;
+}
+
+// Starts a new, unbuffered value at input offset `z` with length 0
+inline void Tokenizer::Value::beginAtOffset(size_t z) {
+ offset = z;
+ length = 0;
+ buffered = false;
+}
+
+inline Tokenizer::ErrorCode Tokenizer::error() const {
+ return _error.code;
+}
+
+
+// Emits any pending separator followed by '{'
+inline Builder& Builder::startObject() {
+ prefix();
+ _state = AfterObjectStart;
+ return appendChar('{');
+}
+
+// Emits '}'; the closed object then counts as a value for separator purposes
+inline Builder& Builder::endObject() {
+ _state = AfterValue;
+ return appendChar('}');
+}
+
+// Emits any pending separator followed by '['
+inline Builder& Builder::startArray() {
+ prefix();
+ _state = AfterArrayStart;
+ return appendChar('[');
+}
+
+// Emits ']'; the closed array then counts as a value for separator purposes
+inline Builder& Builder::endArray() {
+ _state = AfterValue;
+ return appendChar(']');
+}
+
+inline Builder& Builder::fieldName(const std::string& name, TextEncoding enc) {
+ return fieldName(name.data(), name.size(), enc);
+}
+
+// Emits any pending separator followed by the quoted, escaped field name.
+// The ':' is emitted by prefix() when the next item is written.
+inline Builder& Builder::fieldName(const char* v, size_t length,
+ TextEncoding enc) {
+ prefix();
+ _state = AfterFieldName;
+ return appendString((const uint8_t*)v, length, enc);
+}
+
+// Emits any pending separator followed by the quoted, escaped string value
+inline Builder& Builder::value(const char* v, size_t length, TextEncoding enc) {
+ prefix();
+ _state = AfterValue;
+ return appendString((const uint8_t*)v, length, enc);
+}
+
+// `v` must be a null-terminated string (its length is taken with strlen)
+inline Builder& Builder::value(const char* v) {
+ return value(v, strlen(v));
+}
+
+inline Builder& Builder::value(const std::string& v) {
+ return value(v.data(), v.size());
+}
+
+// Emits any pending separator followed by `v` formatted with "%g".
+inline Builder& Builder::value(double v) {
+  prefix();
+  reserve(256);
+  char* const out = _buf + _size;
+  const int written = snprintf(out, 256, "%g", v);
+  assert(written < 256);
+  _size += written;
+  _state = AfterValue;
+  return *this;
+}
+
+// Emits any pending separator followed by `v` in decimal notation.
+// 21 bytes hold the longest possible literal (20 characters for the smallest
+// 64-bit value, including its sign) plus the terminating null byte.
+inline Builder& Builder::value(int64_t v) {
+  prefix();
+  reserve(21);
+  // "%lld" expects a `long long`; int64_t may be a different type (e.g.
+  // `long`), so convert explicitly to keep format and argument in agreement.
+  int z = snprintf(_buf+_size, 21, "%lld", (long long)v);
+  assert(z < 21);
+  _size += z;
+  _state = AfterValue;
+  return *this;
+}
+
+// The remaining integer overloads widen to int64_t and forward
+inline Builder& Builder::value(int v) { return value((int64_t)v); }
+inline Builder& Builder::value(unsigned int v) { return value((int64_t)v); }
+inline Builder& Builder::value(long v) { return value((int64_t)v); }
+
+// Emits any pending separator followed by the literal `true` or `false`.
+inline Builder& Builder::value(bool v) {
+  prefix();
+  const char* const literal = v ? "true" : "false";
+  const size_t literal_len = v ? 4 : 5;
+  reserve(literal_len);
+  memcpy(_buf + _size, literal, literal_len);
+  _size += literal_len;
+  _state = AfterValue;
+  return *this;
+}
+
+// Emits any pending separator followed by the literal `null`.
+inline Builder& Builder::nullValue() {
+  prefix();
+  reserve(4);
+  memcpy(_buf + _size, "null", 4);
+  _size += 4;
+  _state = AfterValue;
+  return *this;
+}
+
+inline size_t Builder::size() const { return _size; }
+inline const char* Builder::bytes() const { return _buf; }
+inline std::string Builder::toString() const {
+ return std::string(bytes(), size());
+}
+// Transfers the buffer to the caller: the builder forgets the buffer (so its
+// destructor will not free it) and is reset to an empty state.
+inline const char* Builder::seizeBytes(size_t& size_out) {
+ const char* buf = _buf;
+ size_out = _size;
+ _buf = 0;
+ _capacity = 0;
+ reset();
+ return buf;
+}
+// Forgets what has been written; buffer and capacity are left untouched
+inline const void Builder::reset() {
+ _size = 0;
+ _state = NeutralState;
+}
+
+// Number of bytes that can be written before the buffer has to grow
+inline size_t Builder::available() const {
+ return _capacity - _size;
+}
+
+// Makes sure at least `size` more bytes fit into the buffer. When growing, the
+// capacity becomes 1.5 times what is strictly required, but never less than
+// 64 bytes.
+inline void Builder::reserve(size_t size) {
+  if (available() >= size) {
+    return; // enough room already
+  }
+
+  #if 0
+  // exact allocation for debugging purposes
+  printf("DEBUG Builder::reserve: size=%zu available=%zu grow_by=%zu\n",
+    size, available(), (size - available()) );
+  _capacity += size - available();
+  #else
+  const size_t shortfall = size - available();
+  _capacity += shortfall;
+  if (_capacity < 64) {
+    _capacity = 64;
+  } else {
+    _capacity = (_capacity * 1.5);
+  }
+  #endif
+  _buf = (char*)realloc((void*)_buf, _capacity);
+}
+
+// Emits the separator required by what was written last: ':' after a field
+// name, ',' after a value, nothing otherwise.
+inline void Builder::prefix() {
+  switch (_state) {
+    case AfterFieldName:
+      appendChar(':');
+      break;
+    case AfterValue:
+      appendChar(',');
+      break;
+    default:
+      break;
+  }
+}
+
+// Appends a single byte, growing the buffer if necessary. Returns *this.
+inline Builder& Builder::appendChar(char byte) {
+ reserve(1);
+ _buf[_size++] = byte;
+ return *this;
+}
+
+}
+
+#endif // JSONT_CXX_INCLUDED
diff --git a/test/test_tokenizer.c b/test/test_tokenizer.c
new file mode 100644
index 0000000..f994c85
--- /dev/null
+++ b/test/test_tokenizer.c
@@ -0,0 +1,180 @@
+#include <jsont.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#define JSONT_ASSERT_FIELD_NAME(fieldName) do { /* current token must be a field name */ \
+  assert(jsont_current(S) == JSONT_FIELD_NAME); /* `S`: tokenizer from the caller's scope */ \
+  assert(jsont_data_equals(S, (const uint8_t*)(fieldName), /* compared byte for byte */ \
+    strlen(fieldName)) == true); /* note: fieldName is expanded twice */ \
+} while(0)
+
+int main(int argc, const char** argv) {
+ // Walks one fixed JSON document token by token, asserting each token's type and value.
+ jsont_ctx_t* S = jsont_create(0); // new reusable tokenizer, released by jsont_destroy() at the end
+
+ const char* inbuf = "{ "
+ "\"\\\"fo\\\"o\": \"Foo\"," // "\"fo\"o": "Foo"
+ "\"1\" : \"\\u2192\","
+ "\"n\":1234,"
+ "\"x\" : \t 12.34,"
+ "\"overflow\" : \t 9999999999999999999999999999999999," // too large for a 64-bit integer
+ "\"b\\/a\\/r\":["
+ "null,"
+ "true,"
+ "false,"
+ "{"
+ "\"x\":12.3"
+ "},"
+ "\n123,"
+ "\"456\","
+ "\"a\\\"b\\\"\","
+ "\"a\\u0000b\","
+ "\"a\\bb\","
+ "\"a\\fb\","
+ "\"a\\nb\","
+ "\"a\\rb\","
+ "\"a\\tb\","
+ "\"\","
+ "\" \""
+ "]"
+ "}";
+
+ jsont_reset(S, (const uint8_t*)inbuf, strlen(inbuf)); // hand the document bytes and their length to the tokenizer
+ jsont_tok_t tok;
+ // NOTE(review): many jsont_next() calls below sit inside assert(); with NDEBUG they are compiled out.
+ tok = jsont_next(S);
+ assert(tok == JSONT_OBJECT_START);
+ assert(jsont_current(S) == JSONT_OBJECT_START);
+
+ tok = jsont_next(S);
+ assert(tok == JSONT_FIELD_NAME);
+
+ // Expect current data to be the bytes '"fo"o'
+ const char* expectedData = "\"fo\"o";
+ const uint8_t* bytes;
+ size_t size = jsont_data_value(S, &bytes);
+ size_t expectedSize = strlen(expectedData);
+ // jsont_data_value() yields a pointer/length pair, so the length is checked first
+ // and the raw bytes are then compared with memcmp over exactly that length.
+ assert(size == expectedSize);
+ int d = memcmp((const void*)expectedData, bytes, size);
+ assert(d == 0);
+
+ // Expect a string value "Foo"
+ tok = jsont_next(S);
+ assert(tok == JSONT_STRING);
+ char* str = jsont_strcpy_value(S);
+ assert(str != 0);
+ assert(strcmp(str, "Foo") == 0);
+ free(str); str = 0; // NOTE(review): free() is declared in <stdlib.h>, which this file does not include directly
+
+ // Expect field name "1". Also tests the integrity of jsont_data_equals
+ tok = jsont_next(S); // NOTE(review): the token type itself is not asserted for this field name
+ assert(jsont_data_equals(S, (const uint8_t*)"1", 1) == true);
+ assert(jsont_str_equals(S, "1") == true);
+ size = jsont_data_value(S, &bytes);
+ assert(size == 1);
+ assert(memcmp((const void*)"1", (const void*)bytes, 1) == 0);
+
+ // Expect the string '\u2192' (RIGHTWARDS ARROW, UTF8: E2,86,92)
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "\xe2\x86\x92") == true);
+
+ // Expect a field name 'n'
+ jsont_next(S);
+ JSONT_ASSERT_FIELD_NAME("n");
+
+ // Expect a number value '1234'
+ assert(jsont_next(S) == JSONT_NUMBER_INT);
+ // The same token is read back both as an integer and as a float.
+ assert(jsont_int_value(S) == 1234LL);
+ assert(jsont_float_value(S) == 1234.0);
+
+ // Expect a field name 'x'
+ jsont_next(S);
+ JSONT_ASSERT_FIELD_NAME("x");
+
+ // Expect a number value '12.34'
+ assert(jsont_next(S) == JSONT_NUMBER_FLOAT);
+ assert(jsont_float_value(S) == 12.34);
+ assert(jsont_int_value(S) == 12LL); // integer read of a float token: only the integral part is expected
+
+ jsont_next(S);
+ JSONT_ASSERT_FIELD_NAME("overflow");
+
+ // Expect the out-of-range integer to be cut off at INT64_MAX
+ assert(jsont_next(S) == JSONT_NUMBER_INT);
+ assert(jsont_int_value(S) == INT64_MAX);
+
+ // Expect a valid floating point value (although it will have less-than
+ // perfect precision)
+ assert(!isnan(jsont_float_value(S)));
+
+ // Expect a field name 'b/a/r' (the input spells the slashes with escapes)
+ jsont_next(S);
+ JSONT_ASSERT_FIELD_NAME("b/a/r");
+
+ // Expect start of array
+ assert(jsont_next(S) == JSONT_ARRAY_START);
+
+ // Expect null, true and false
+ assert(jsont_next(S) == JSONT_NULL);
+ assert(jsont_next(S) == JSONT_TRUE);
+ assert(jsont_next(S) == JSONT_FALSE);
+
+ // { "x": 12.3 }
+ assert(jsont_next(S) == JSONT_OBJECT_START);
+ jsont_next(S);
+ JSONT_ASSERT_FIELD_NAME("x");
+ assert(jsont_next(S) == JSONT_NUMBER_FLOAT);
+ assert(jsont_float_value(S) == 12.3);
+ assert(jsont_next(S) == JSONT_OBJECT_END);
+
+ // 123, "456", "a\"b\""
+ assert(jsont_next(S) == JSONT_NUMBER_INT);
+ assert(jsont_int_value(S) == 123);
+
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "456") == true);
+
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\"b\"") == true);
+
+ // "a\u0000b"
+ assert(jsont_next(S) == JSONT_STRING);
+ const uint8_t b3[] = {'a',0,'b'}; // embedded zero byte, so the length-based comparison is used
+ assert(jsont_data_equals(S, b3, sizeof(b3)) == true);
+
+ // "a\{b,f,n,r,t}b"
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\bb") == true);
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\fb") == true);
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\nb") == true);
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\rb") == true);
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "a\tb") == true);
+
+ // ""
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, "") == true);
+ assert(jsont_str_equals(S, " ") == false);
+
+ // " "
+ assert(jsont_next(S) == JSONT_STRING);
+ assert(jsont_str_equals(S, " ") == true);
+ assert(jsont_str_equals(S, "") == false);
+
+ // ] }
+ assert(jsont_next(S) == JSONT_ARRAY_END);
+ assert(jsont_next(S) == JSONT_OBJECT_END);
+
+
+ jsont_destroy(S);
+ printf("PASS\n");
+ return 0;
+}