Austin Schuh | f417eaf | 2019-09-16 21:58:36 -0700 | [diff] [blame] | 1 | // JSON Tokenizer and builder. Copyright (c) 2012, Rasmus Andersson. All rights |
| 2 | // reserved. Use of this source code is governed by a MIT-style license that can |
| 3 | // be found in the LICENSE file. |
| 4 | #ifndef JSONT_CXX_INCLUDED |
| 5 | #define JSONT_CXX_INCLUDED |
| 6 | |
#include <assert.h>
#include <math.h>
#include <stdbool.h> // bool
#include <stdint.h>  // uint8_t, int64_t
#include <stdio.h>   // snprintf
#include <stdlib.h>  // size_t
#include <string.h>  // strlen

#include <new>       // std::bad_alloc
#include <stdexcept>
#include <string>
| 15 | |
// Detects whether rvalue references (move semantics) are available and
// defines JSONT_CXX_RVALUE_REFS to 1 or 0 accordingly.
//
// `__has_feature` is only probed inside a nested #if: using it in the same
// expression as `defined(__has_feature)` does not parse on preprocessors that
// lack it, because the whole #if expression must be syntactically valid even
// when the left-hand side of `&&` is false.
#if defined(__has_feature)
  #if __has_feature(cxx_rvalue_references)
    #define JSONT_CXX_RVALUE_REFS 1
  #endif
#endif
#if !defined(JSONT_CXX_RVALUE_REFS) && ( \
    (defined(__cplusplus) && __cplusplus >= 201103L) || \
    (defined(_MSC_VER) && _MSC_VER >= 1600) || \
    (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
  #define JSONT_CXX_RVALUE_REFS 1
#endif
#ifndef JSONT_CXX_RVALUE_REFS
  #define JSONT_CXX_RVALUE_REFS 0
#endif
| 24 | |
| 25 | namespace jsont { |
| 26 | |
// Kinds of tokens produced by the tokenizer. Enumerator values are sequential
// starting at 0; Integer through FieldName form the contiguous range of
// value-carrying tokens.
enum Token {
  End = 0,      // Input ended
  ObjectStart,  // {
  ObjectEnd,    // }
  ArrayStart,   // [
  ArrayEnd,     // ]
  True,         // true
  False,        // false
  Null,         // null
  Integer,      // number value without a fraction part
  Float,        // number value with a fraction part
  String,       // string value
  FieldName,    // field name
  Error,        // An error occurred (see `error()` for details)
  _Comma,
};
| 44 | |
// Text encodings understood by the tokenizer and the builder. UTF-8 is the
// only enumerator declared here.
enum TextEncoding {
  UTF8TextEncoding = 0,
};
| 49 | |
// Returns the name of `token`.
// Only the declaration is visible here; the definition lives elsewhere.
const char* token_name(jsont::Token token);
| 52 | |
// Forward declaration only; Tokenizer names this class as a friend. Its
// definition is not visible in this portion of the file.
class TokenizerInternal;
| 54 | |
| 55 | // Reads a sequence of bytes and produces tokens and values while doing so |
| 56 | class Tokenizer { |
| 57 | public: |
| 58 | Tokenizer(const char* bytes, size_t length, TextEncoding encoding); |
| 59 | ~Tokenizer(); |
| 60 | |
| 61 | // Read next token |
| 62 | const Token& next(); |
| 63 | |
| 64 | // Access current token |
| 65 | const Token& current() const; |
| 66 | |
| 67 | // Reset the tokenizer, making it possible to reuse this parser so to avoid |
| 68 | // unnecessary memory allocation and deallocation. |
| 69 | void reset(const char* bytes, size_t length, TextEncoding encoding); |
| 70 | |
| 71 | // True if the current token has a value |
| 72 | bool hasValue() const; |
| 73 | |
| 74 | // Returns a slice of the input which represents the current value, or nothing |
| 75 | // (returns 0) if the current token has no value (e.g. start of an object). |
| 76 | size_t dataValue(const char const** bytes) const; |
| 77 | |
| 78 | // Returns a *copy* of the current string value. |
| 79 | std::string stringValue() const; |
| 80 | |
| 81 | // Returns the current value as a double-precision floating-point number. |
| 82 | double floatValue() const; |
| 83 | |
| 84 | // Returns the current value as a signed 64-bit integer. |
| 85 | int64_t intValue() const; |
| 86 | |
| 87 | // Returns the current value as a boolean |
| 88 | bool boolValue() const; |
| 89 | |
| 90 | // Error codes |
| 91 | typedef enum { |
| 92 | UnspecifiedError = 0, |
| 93 | UnexpectedComma, |
| 94 | UnexpectedTrailingComma, |
| 95 | InvalidByte, |
| 96 | PrematureEndOfInput, |
| 97 | MalformedUnicodeEscapeSequence, |
| 98 | MalformedNumberLiteral, |
| 99 | UnterminatedString, |
| 100 | SyntaxError, |
| 101 | } ErrorCode; |
| 102 | |
| 103 | // Returns the error code of the last error |
| 104 | ErrorCode error() const; |
| 105 | |
| 106 | // Returns a human-readable message for the last error. Never returns NULL. |
| 107 | const char* errorMessage() const; |
| 108 | |
| 109 | // The byte offset into input where the tokenizer is currently looking. In the |
| 110 | // event of an error, this will point to the source of the error. |
| 111 | size_t inputOffset() const; |
| 112 | |
| 113 | // Total number of input bytes |
| 114 | size_t inputSize() const; |
| 115 | |
| 116 | // A pointer to the input data as passed to `reset` or the constructor. |
| 117 | const char* inputBytes() const; |
| 118 | |
| 119 | friend class TokenizerInternal; |
| 120 | private: |
| 121 | size_t availableInput() const; |
| 122 | size_t endOfInput() const; |
| 123 | const Token& setToken(Token t); |
| 124 | const Token& setError(ErrorCode error); |
| 125 | |
| 126 | struct { |
| 127 | const uint8_t* bytes; |
| 128 | size_t length; |
| 129 | size_t offset; |
| 130 | } _input; |
| 131 | struct Value { |
| 132 | Value() : offset(0), length(0), buffered(false) {} |
| 133 | void beginAtOffset(size_t z); |
| 134 | size_t offset; // into _input.bytes |
| 135 | size_t length; |
| 136 | std::string buffer; |
| 137 | bool buffered; // if true, contents lives in buffer |
| 138 | } _value; |
| 139 | Token _token; |
| 140 | struct { |
| 141 | ErrorCode code; |
| 142 | } _error; |
| 143 | }; |
| 144 | |
| 145 | |
| 146 | // Helps in building JSON, providing a final sequential byte buffer |
| 147 | class Builder { |
| 148 | public: |
| 149 | Builder() : _buf(0), _capacity(0), _size(0), _state(NeutralState) {} |
| 150 | ~Builder() { if (_buf) { free(_buf); _buf = 0; } } |
| 151 | Builder(const Builder& other); |
| 152 | Builder& operator=(const Builder& other); |
| 153 | #if JSONT_CXX_RVALUE_REFS |
| 154 | Builder(Builder&& other); |
| 155 | Builder& operator=(Builder&& other); |
| 156 | #endif |
| 157 | |
| 158 | Builder& startObject(); |
| 159 | Builder& endObject(); |
| 160 | Builder& startArray(); |
| 161 | Builder& endArray(); |
| 162 | Builder& fieldName(const char* v, size_t length, TextEncoding e=UTF8TextEncoding); |
| 163 | Builder& fieldName(const std::string& name, TextEncoding enc=UTF8TextEncoding); |
| 164 | Builder& value(const char* v, size_t length, TextEncoding e=UTF8TextEncoding); |
| 165 | Builder& value(const char* v); |
| 166 | Builder& value(const std::string& v); |
| 167 | Builder& value(double v); |
| 168 | Builder& value(int64_t v); |
| 169 | Builder& value(int v); |
| 170 | Builder& value(unsigned int v); |
| 171 | Builder& value(long v); |
| 172 | Builder& value(bool v); |
| 173 | Builder& nullValue(); |
| 174 | |
| 175 | size_t size() const; |
| 176 | const char* bytes() const; |
| 177 | std::string toString() const; |
| 178 | const char* seizeBytes(size_t& size_out); |
| 179 | const void reset(); |
| 180 | |
| 181 | private: |
| 182 | size_t available() const; |
| 183 | void reserve(size_t size); |
| 184 | void prefix(); |
| 185 | Builder& appendString(const uint8_t* v, size_t length, TextEncoding enc); |
| 186 | Builder& appendChar(char byte); |
| 187 | |
| 188 | char* _buf; |
| 189 | size_t _capacity; |
| 190 | size_t _size; |
| 191 | enum { |
| 192 | NeutralState = 0, |
| 193 | AfterFieldName, |
| 194 | AfterValue, |
| 195 | AfterObjectStart, |
| 196 | AfterArrayStart, |
| 197 | } _state; |
| 198 | }; |
| 199 | |
| 200 | |
| 201 | // Convenience function |
| 202 | inline Builder build() { return Builder(); } |
| 203 | |
| 204 | |
| 205 | // ------------------- internal --------------------- |
| 206 | |
| 207 | inline Tokenizer::Tokenizer(const char* bytes, size_t length, |
| 208 | TextEncoding encoding) : _token(End) { |
| 209 | reset(bytes, length, encoding); |
| 210 | } |
| 211 | |
| 212 | inline const Token& Tokenizer::current() const { return _token; } |
| 213 | |
| 214 | inline bool Tokenizer::hasValue() const { |
| 215 | return _token >= Integer && _token <= FieldName; |
| 216 | } |
| 217 | |
| 218 | inline std::string Tokenizer::stringValue() const { |
| 219 | const char* bytes; |
| 220 | size_t size = dataValue(&bytes); |
| 221 | return std::string(bytes, size); |
| 222 | } |
| 223 | |
| 224 | inline bool Tokenizer::boolValue() const { |
| 225 | return _token == True; |
| 226 | } |
| 227 | |
| 228 | inline size_t Tokenizer::availableInput() const { |
| 229 | return _input.length - _input.offset; |
| 230 | } |
| 231 | inline size_t Tokenizer::endOfInput() const { |
| 232 | return _input.offset == _input.length; |
| 233 | } |
| 234 | inline const Token& Tokenizer::setToken(Token t) { |
| 235 | return _token = t; |
| 236 | } |
| 237 | inline const Token& Tokenizer::setError(Tokenizer::ErrorCode error) { |
| 238 | _error.code = error; |
| 239 | return _token = Error; |
| 240 | } |
| 241 | inline size_t Tokenizer::inputOffset() const { |
| 242 | return _input.offset; |
| 243 | } |
| 244 | inline size_t Tokenizer::inputSize() const { |
| 245 | return _input.length; |
| 246 | } |
| 247 | inline const char* Tokenizer::inputBytes() const { |
| 248 | return (const char*)_input.bytes; |
| 249 | } |
| 250 | |
| 251 | inline void Tokenizer::Value::beginAtOffset(size_t z) { |
| 252 | offset = z; |
| 253 | length = 0; |
| 254 | buffered = false; |
| 255 | } |
| 256 | |
| 257 | inline Tokenizer::ErrorCode Tokenizer::error() const { |
| 258 | return _error.code; |
| 259 | } |
| 260 | |
| 261 | |
| 262 | inline Builder& Builder::startObject() { |
| 263 | prefix(); |
| 264 | _state = AfterObjectStart; |
| 265 | return appendChar('{'); |
| 266 | } |
| 267 | |
| 268 | inline Builder& Builder::endObject() { |
| 269 | _state = AfterValue; |
| 270 | return appendChar('}'); |
| 271 | } |
| 272 | |
| 273 | inline Builder& Builder::startArray() { |
| 274 | prefix(); |
| 275 | _state = AfterArrayStart; |
| 276 | return appendChar('['); |
| 277 | } |
| 278 | |
| 279 | inline Builder& Builder::endArray() { |
| 280 | _state = AfterValue; |
| 281 | return appendChar(']'); |
| 282 | } |
| 283 | |
| 284 | inline Builder& Builder::fieldName(const std::string& name, TextEncoding enc) { |
| 285 | return fieldName(name.data(), name.size(), enc); |
| 286 | } |
| 287 | |
| 288 | inline Builder& Builder::fieldName(const char* v, size_t length, |
| 289 | TextEncoding enc) { |
| 290 | prefix(); |
| 291 | _state = AfterFieldName; |
| 292 | return appendString((const uint8_t*)v, length, enc); |
| 293 | } |
| 294 | |
| 295 | inline Builder& Builder::value(const char* v, size_t length, TextEncoding enc) { |
| 296 | prefix(); |
| 297 | _state = AfterValue; |
| 298 | return appendString((const uint8_t*)v, length, enc); |
| 299 | } |
| 300 | |
| 301 | inline Builder& Builder::value(const char* v) { |
| 302 | return value(v, strlen(v)); |
| 303 | } |
| 304 | |
| 305 | inline Builder& Builder::value(const std::string& v) { |
| 306 | return value(v.data(), v.size()); |
| 307 | } |
| 308 | |
| 309 | inline Builder& Builder::value(double v) { |
| 310 | prefix(); |
| 311 | reserve(256); |
| 312 | int z = snprintf(_buf+_size, 256, "%g", v); |
| 313 | assert(z < 256); |
| 314 | _size += z; |
| 315 | _state = AfterValue; |
| 316 | return *this; |
| 317 | } |
| 318 | |
| 319 | inline Builder& Builder::value(int64_t v) { |
| 320 | prefix(); |
| 321 | reserve(21); |
| 322 | int z = snprintf(_buf+_size, 21, "%lld", v); |
| 323 | assert(z < 21); |
| 324 | _size += z; |
| 325 | _state = AfterValue; |
| 326 | return *this; |
| 327 | } |
| 328 | |
| 329 | inline Builder& Builder::value(int v) { return value((int64_t)v); } |
| 330 | inline Builder& Builder::value(unsigned int v) { return value((int64_t)v); } |
| 331 | inline Builder& Builder::value(long v) { return value((int64_t)v); } |
| 332 | |
| 333 | inline Builder& Builder::value(bool v) { |
| 334 | prefix(); |
| 335 | if (v) { |
| 336 | reserve(4); |
| 337 | _buf[_size] = 't'; |
| 338 | _buf[++_size] = 'r'; |
| 339 | _buf[++_size] = 'u'; |
| 340 | _buf[++_size] = 'e'; |
| 341 | ++_size; |
| 342 | } else { |
| 343 | reserve(5); |
| 344 | _buf[_size] = 'f'; |
| 345 | _buf[++_size] = 'a'; |
| 346 | _buf[++_size] = 'l'; |
| 347 | _buf[++_size] = 's'; |
| 348 | _buf[++_size] = 'e'; |
| 349 | ++_size; |
| 350 | } |
| 351 | _state = AfterValue; |
| 352 | return *this; |
| 353 | } |
| 354 | |
| 355 | inline Builder& Builder::nullValue() { |
| 356 | prefix(); |
| 357 | reserve(4); |
| 358 | _buf[_size] = 'n'; |
| 359 | _buf[++_size] = 'u'; |
| 360 | _buf[++_size] = 'l'; |
| 361 | _buf[++_size] = 'l'; |
| 362 | ++_size; |
| 363 | _state = AfterValue; |
| 364 | return *this; |
| 365 | } |
| 366 | |
| 367 | inline size_t Builder::size() const { return _size; } |
| 368 | inline const char* Builder::bytes() const { return _buf; } |
| 369 | inline std::string Builder::toString() const { |
| 370 | return std::string(bytes(), size()); |
| 371 | } |
| 372 | inline const char* Builder::seizeBytes(size_t& size_out) { |
| 373 | const char* buf = _buf; |
| 374 | size_out = _size; |
| 375 | _buf = 0; |
| 376 | _capacity = 0; |
| 377 | reset(); |
| 378 | return buf; |
| 379 | } |
| 380 | inline const void Builder::reset() { |
| 381 | _size = 0; |
| 382 | _state = NeutralState; |
| 383 | } |
| 384 | |
| 385 | inline size_t Builder::available() const { |
| 386 | return _capacity - _size; |
| 387 | } |
| 388 | |
| 389 | inline void Builder::reserve(size_t size) { |
| 390 | if (available() < size) { |
| 391 | #if 0 |
| 392 | // exact allocation for debugging purposes |
| 393 | printf("DEBUG Builder::reserve: size=%zu available=%zu grow_by=%zu\n", |
| 394 | size, available(), (size - available()) ); |
| 395 | _capacity += size - available(); |
| 396 | #else |
| 397 | _capacity += size - available(); |
| 398 | _capacity = (_capacity < 64) ? 64 : (_capacity * 1.5); |
| 399 | #endif |
| 400 | _buf = (char*)realloc((void*)_buf, _capacity); |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | inline void Builder::prefix() { |
| 405 | if (_state == AfterFieldName) { |
| 406 | appendChar(':'); |
| 407 | } else if (_state == AfterValue) { |
| 408 | appendChar(','); |
| 409 | } |
| 410 | } |
| 411 | |
| 412 | inline Builder& Builder::appendChar(char byte) { |
| 413 | reserve(1); |
| 414 | _buf[_size++] = byte; |
| 415 | return *this; |
| 416 | } |
| 417 | |
| 418 | } |
| 419 | |
| 420 | #endif // JSONT_CXX_INCLUDED |