Squashed 'third_party/jsont/' content from commit 1536152d7 Change-Id: I51a80190772b74ca0d45fd3fadc130e872b57cc0 git-subtree-dir: third_party/jsont git-subtree-split: 1536152d7c1926448d42e4a691acd9a15940b20c

commit: f417eaf93c086c0695adeb0e9cacd44e7e537b6a [log] [tgz]
author: Austin Schuh <austin.linux@gmail.com> Mon Sep 16 21:58:36 2019 -0700
committer: Austin Schuh <austin.linux@gmail.com> Mon Sep 16 21:58:36 2019 -0700
tree: 1b9c3b952bf1501aacc99fbfd40e7a2c730c1b2d
diff --git a/jsont.cc b/jsont.cc
new file mode 100644
index 0000000..09b1e45
--- /dev/null
+++ b/jsont.cc

@@ -0,0 +1,561 @@
+#include "jsont.hh"
+
+namespace jsont {
+
+static const int8_t kHexValueTable[55] = { // hex digit values for bytes '0'..'f', indexed by (byte - '0'); -1 marks a byte that is not a hex digit
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-9
+  -1, -1, -1, -1, -1, -1, -1, // ':' through '@'
+  10, 11, 12, 13, 14, 15, // A-F
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'G' through 'Y'
+  -1, -1, -1, -1, -1, -1, -1, // 'Z' through '`'
+  10, 11, 12, 13, 14, 15 // a-f
+};
+
+// Parses exactly `len` hexadecimal digits starting at `bytes` into an unsigned
+// 64-bit value (upper- and lower-case digits are both accepted).
+//
+// Returns UINT64_MAX when a byte is not a hexadecimal digit or when the value
+// would not fit into 64 bits. A literal that legitimately encodes UINT64_MAX
+// is therefore indistinguishable from an error.
+static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
+  uint64_t value = 0;
+  // Largest value that can still be multiplied by 16 without overflowing, and
+  // the largest digit that may be added once that value has been reached.
+  const uint64_t cutoff = UINT64_MAX / 16;
+  const int cutoff_digit = (int)(UINT64_MAX - cutoff * 16);
+
+  for (size_t i = 0; i != len; ++i) {
+    const uint8_t b = bytes[i];
+    // The table only covers '0'..'f'; everything outside is not a digit.
+    const int8_t digit = (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;
+    // The rejection test must look at `digit`: `b` is an unsigned byte and can
+    // never compare equal to -1.
+    if (digit == -1 || // bad digit
+        (value > cutoff) || // overflow
+        ((value == cutoff) && (digit > cutoff_digit)) ) {
+      return UINT64_MAX;
+    }
+    value = (value * 16) + (uint64_t)digit;
+  }
+
+  return value;
+}
+
+
+// _JSONT_NAN expands to a quiet NaN. The NAN macro is preferred when the math
+// header provides it; otherwise nan() is called with an empty tag string
+// (nan() takes a `const char*`, so a literal 0 would be a null pointer).
+#ifdef NAN
+  #define _JSONT_NAN NAN
+#else
+  #define _JSONT_NAN nan("")
+#endif
+
+
+// Maps a token to a short, static, human-readable name. Tokens that are not
+// listed below yield "?".
+const char* token_name(jsont::Token tok) {
+  const char* label = "?";
+  switch (tok) {
+    case End:         label = "End";         break;
+    case ObjectStart: label = "ObjectStart"; break;
+    case ObjectEnd:   label = "ObjectEnd";   break;
+    case ArrayStart:  label = "ArrayStart";  break;
+    case ArrayEnd:    label = "ArrayEnd";    break;
+    case True:        label = "True";        break;
+    case False:       label = "False";       break;
+    case Null:        label = "Null";        break;
+    case Integer:     label = "Integer";     break;
+    case Float:       label = "Float";       break;
+    case String:      label = "String";      break;
+    case FieldName:   label = "FieldName";   break;
+    default:                                 break;
+  }
+  return label;
+}
+
+
+// Static helpers that operate directly on a Tokenizer's input state.
+class TokenizerInternal {
+public:
+  // Address of the next unread input byte of `self`.
+  inline static const uint8_t* currentInput(const Tokenizer& self) {
+    const uint8_t* base = self._input.bytes;
+    return base + self._input.offset;
+  }
+
+  // Matches the `len` bytes of `str` against the unread input. On a match the
+  // bytes are consumed and `token` becomes the current token; otherwise an
+  // error is recorded (PrematureEndOfInput when fewer than `len` bytes remain,
+  // InvalidByte when the bytes differ) and nothing is consumed.
+  inline static const Token& readAtom(Tokenizer& self, const char* str,
+        size_t len, const Token& token) {
+    if (self.availableInput() < len) {
+      return self.setError(Tokenizer::PrematureEndOfInput);
+    }
+
+    const bool matches = (memcmp(currentInput(self), str, len) == 0);
+    if (!matches) {
+      return self.setError(Tokenizer::InvalidByte);
+    }
+
+    self._input.offset += len;
+    return self.setToken(token);
+  }
+};
+
+
+Tokenizer::~Tokenizer() {} // empty body: no explicit cleanup is performed here
+
+
+// Points the tokenizer at a new input buffer of `length` bytes, clears the
+// error code and immediately advances to the first token. Only UTF-8 input is
+// supported (checked by assertion).
+void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) {
+  assert(encoding == UTF8TextEncoding); // only supported encoding
+
+  _error.code = UnspecifiedError;
+
+  _input.offset = 0;
+  _input.length = length;
+  _input.bytes = reinterpret_cast<const uint8_t*>(bytes);
+
+  // Advance to first token
+  next();
+}
+
+
+const char* Tokenizer::errorMessage() const {
+  switch (_error.code) {
+    case UnexpectedComma:
+      return "Unexpected comma";
+    case UnexpectedTrailingComma:
+      return "Unexpected trailing comma";
+    case InvalidByte:
+      return "Invalid input byte";
+    case PrematureEndOfInput:
+      return "Premature end of input";
+    case MalformedUnicodeEscapeSequence:
+      return "Malformed Unicode escape sequence";
+    case MalformedNumberLiteral:
+      return "Malformed number literal";
+    case UnterminatedString:
+      return "Unterminated string";
+    case SyntaxError:
+      return "Illegal JSON (syntax error)";
+    default:
+      return "Unspecified error";
+  }
+}
+
+
+// Exposes the raw bytes of the current value without copying them.
+//
+// On return `*bytes` points either into the internal buffer (when the value
+// had to be buffered, e.g. a string containing escape sequences) or directly
+// into the input. The bytes are not guaranteed to be NUL-terminated; use the
+// returned length. Returns 0 and leaves `*bytes` untouched when the current
+// token carries no value.
+//
+// Note: the parameter type is spelled `const char**`; the previous spelling
+// `const char const**` named the same type but repeated the `const`
+// qualifier, which is ill-formed C++.
+size_t Tokenizer::dataValue(const char** bytes) const {
+  if (!hasValue()) { return 0; }
+  if (_value.buffered) {
+    *bytes = _value.buffer.data();
+    return _value.buffer.size();
+  }
+  *bytes = (const char*)(_input.bytes + _value.offset);
+  return _value.length;
+}
+
+
+// Interprets the current token as a double.
+//
+// Tokens without a value yield 1.0 for True and 0.0 for everything else.
+// Otherwise the value's text is parsed with strtod(). Returns NaN when the
+// value sits at the very end of the input and is too long (more than 127
+// bytes) to be copied into the local scratch buffer.
+double Tokenizer::floatValue() const {
+  if (!hasValue()) {
+    return _token == jsont::True ? 1.0 : 0.0;
+  }
+
+  if (_value.buffered) {
+    // edge-case since only happens with string values using escape sequences
+    return strtod(_value.buffer.c_str(), (char**)0);
+  }
+
+  const char* bytes = (const char*)_input.bytes + _value.offset;
+
+  if (availableInput() == 0) {
+    // In this case where the data lies at the edge of the buffer, we can't pass
+    // it directly to strtod, since there will be no sentinel byte. We are fine
+    // with a copy, since this is an edge case.
+    // The scratch buffer must be an array of char (not of char*), otherwise
+    // the terminator below lands at the wrong byte offset.
+    char buf[128];
+    if (_value.length > sizeof(buf) - 1) {
+      // We are unable to interpret such a large literal in this edge-case
+      return _JSONT_NAN;
+    }
+    memcpy(buf, bytes, _value.length);
+    buf[_value.length] = '\0';
+    return strtod(buf, (char**)0);
+  }
+
+  // A byte that is not part of the literal follows in the input and stops
+  // strtod, so the input can be parsed in place.
+  return strtod(bytes, (char**)0);
+}
+
+
+// Interprets the current token as a signed 64-bit integer.
+//
+// Tokens without a value yield 1 for True and 0 for everything else.
+// Otherwise the value's text is parsed as base 10 with strtoll(). Returns 0
+// when the value sits at the very end of the input and is too long (more than
+// 20 bytes) to be copied into the local scratch buffer.
+int64_t Tokenizer::intValue() const {
+  if (!hasValue()) {
+    return _token == jsont::True ? 1LL : 0LL;
+  }
+
+  if (_value.buffered) {
+    // edge-case since only happens with string values using escape sequences
+    return strtoll(_value.buffer.c_str(), (char**)0, 10);
+  }
+
+  const char* bytes = (const char*)_input.bytes + _value.offset;
+
+  if (availableInput() == 0) {
+    // In this case where the data lies at the edge of the buffer, we can't pass
+    // it directly to strtoll, since there will be no sentinel byte. We are fine
+    // with a copy, since this is an edge case.
+    // The scratch buffer must be an array of char (not of char*), otherwise
+    // the terminator below lands at the wrong byte offset.
+    char buf[21];
+    if (_value.length > sizeof(buf) - 1) {
+      // We are unable to interpret such a large literal in this edge-case
+      return 0;
+    }
+    memcpy(buf, bytes, _value.length);
+    buf[_value.length] = '\0';
+    return strtoll(buf, (char**)0, 10);
+  }
+
+  // A byte that is not part of the literal follows in the input and stops
+  // strtoll, so the input can be parsed in place.
+  return strtoll(bytes, (char**)0, 10);
+}
+
+
+// Consumes input until the next token has been read and returns it.
+//
+// Whitespace between tokens is skipped. Strings are reported as FieldName
+// when they are followed by a colon and as String otherwise; a string that
+// contains escape sequences is decoded into the value buffer, any other value
+// is referenced in place in the input. Number literals are reported as
+// Integer, or as Float once a '.' has been seen. When the input is exhausted
+// the End token is set. Malformed input is reported through setError().
+const Token& Tokenizer::next() {
+  //
+  // { } [ ] n t f "
+  //         | | | |
+  //         | | | +- /[^"]*/ "
+  //         | | +- a l s e
+  //         | +- r u e
+  //         +- u l l
+  //
+  while (!endOfInput()) {
+    uint8_t b = _input.bytes[_input.offset++];
+    switch (b) {
+      case '{': return setToken(ObjectStart);
+      case '}': {
+        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
+        return setToken(ObjectEnd);
+      }
+
+      case '[': return setToken(ArrayStart);
+      case ']': {
+        if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
+        return setToken(ArrayEnd);
+      }
+
+      case 'n':
+        return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null);
+      case 't':
+        return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True);
+      case 'f':
+        return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False);
+
+      case ' ': case '\t': case '\r': case '\n': // IETF RFC4627
+        // ignore whitespace and let the outer "while" do its thing
+        break;
+
+      case 0:
+        return setError(InvalidByte);
+
+      // when we read a value, we don't produce a token until we either reach
+      // end of input, a colon (then the value is a field name), a comma, or an
+      // array or object terminator.
+
+      case '"': {
+        _value.beginAtOffset(_input.offset);
+
+        // Becomes true only when the unescaped closing quote has been read.
+        // Looking at the last byte alone is not enough: an escaped quote (\")
+        // at the very end of the input would look like a terminator.
+        bool closed = false;
+
+        while (!endOfInput()) {
+          b = _input.bytes[_input.offset++];
+          // The byte just read may be the last input byte, in which case the
+          // offset now equals the length.
+          assert(_input.offset <= _input.length);
+
+          switch (b) {
+
+            case '\\': {
+              // We must go buffered since the input segment != value
+              if (!_value.buffered) {
+                _value.buffered = true;
+                _value.buffer.assign(
+                  (const char*)(_input.bytes+_value.offset),
+                  _input.offset - _value.offset - 1
+                );
+              }
+
+              if (endOfInput()) {
+                return setError(PrematureEndOfInput);
+              }
+
+              b = _input.bytes[_input.offset++];
+              switch (b) {
+                case 'b': _value.buffer.append(1, '\x08'); break;
+                case 'f': _value.buffer.append(1, '\x0C'); break;
+                case 'n': _value.buffer.append(1, '\x0A'); break;
+                case 'r': _value.buffer.append(1, '\x0D'); break;
+                case 't': _value.buffer.append(1, '\x09'); break;
+                case 'u': {
+                  // \uxxxx
+                  if (availableInput() < 4) {
+                    return setError(PrematureEndOfInput);
+                  }
+
+                  uint64_t utf16cp =
+                    _xtou64(TokenizerInternal::currentInput(*this), 4);
+                  _input.offset += 4;
+
+                  if (utf16cp > 0xffff) {
+                    return setError(MalformedUnicodeEscapeSequence);
+                  }
+
+                  uint16_t cp = (uint16_t)(0xffff & utf16cp);
+
+                  // Append UTF-8 byte(s) representing the Unicode codepoint cp
+                  if (cp < 0x80) {
+                    // U+0000 - U+007F
+                    uint8_t cp8 = ((uint8_t)cp);
+                    _value.buffer.append(1, (char)cp8);
+                  } else if (cp < 0x800) {
+                    // U+0080 - U+07FF
+                    uint8_t cp8 = (uint8_t)((cp >> 6) | 0xc0);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                  } else if (cp >= 0xD800u && cp <= 0xDFFFu) {
+                    // UTF-16 Surrogate pairs -- according to the UTF-8
+                    // definition (RFC 3629) the high and low surrogate halves
+                    // used by UTF-16 (U+D800 through U+DFFF) are not legal
+                    // Unicode values, and the UTF-8 encoding of them is an
+                    // invalid byte sequence. Instead of throwing an error, we
+                    // substitute this character with the replacement character
+                    // U+FFFD (UTF-8: EF,BF,BD).
+                    _value.buffer.append("\xEF\xBF\xBD");
+                  } else {
+                    // U+0800 - U+FFFF
+                    uint8_t cp8 = (uint8_t)((cp >> 12) | 0xe0);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)(((cp >> 6) & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                    cp8 = (uint8_t)((cp & 0x3f) | 0x80);
+                    _value.buffer.append(1, (char)cp8);
+                  }
+
+                  break;
+                }
+                default:
+                  // Any other escaped byte (e.g. \" \\ \/) stands for itself
+                  _value.buffer.append(1, (char)b); break;
+              }
+              break;
+            }
+
+            case '"':
+              closed = true;
+              goto after_initial_read_b;
+
+            case 0:
+              return setError(InvalidByte);
+
+            default: {
+              if (_value.buffered) {
+                // TODO: Make this efficient by appending chunks between
+                // boundaries instead of appending per-byte
+                _value.buffer.append(1, (char)b);
+              }
+              break;
+            }
+          } // switch(b)
+        } // while (!endOfInput())
+
+        after_initial_read_b:
+        if (!closed) {
+          return setError(UnterminatedString);
+        }
+
+        if (!_value.buffered) {
+          // The offset is one past the closing quote; exclude the quote
+          _value.length = _input.offset - _value.offset - 1;
+        }
+
+        // is this a field name?
+        while (!endOfInput()) {
+          b = _input.bytes[_input.offset++];
+          switch (b) {
+            case ' ': case '\t': case '\r': case '\n': break;
+            case ':': return setToken(FieldName);
+            case ',': goto string_read_return_string;
+            case ']': case '}': {
+              --_input.offset; // rewind
+              goto string_read_return_string;
+            }
+            case 0: return setError(InvalidByte);
+            default: {
+              // Expected a comma or a colon
+              return setError(SyntaxError);
+            }
+          }
+        }
+
+        string_read_return_string:
+        return setToken(jsont::String);
+      }
+
+      case ',': {
+        if (_token == ObjectStart || _token == ArrayStart || _token == _Comma) {
+          return setError(UnexpectedComma);
+        }
+        _token = _Comma;
+        break;
+      }
+
+      default: {
+        if (isdigit((int)b) || b == '+' || b == '-') {
+          // We are reading a number
+          _value.beginAtOffset(_input.offset-1);
+          Token token = jsont::Integer;
+
+          while (!endOfInput()) {
+            b = _input.bytes[_input.offset++];
+            switch (b) {
+              case '0': case '1': case '2': case '3': case '4':
+              case '5': case '6': case '7': case '8': case '9':
+                break;
+              case '.': token = jsont::Float; break;
+              case 'E': case 'e': case '-': case '+': {
+                if (token != jsont::Float) {
+                  return setError(MalformedNumberLiteral);
+                }
+                break;
+              }
+              default: {
+                // `b` terminated the literal. The offset is already one past
+                // `b`, so the literal spans [_value.offset, _input.offset-1).
+                const size_t literalLength =
+                  _input.offset - 1 - _value.offset;
+
+                // A sign on its own is not a number
+                if ( (literalLength == 1) &&
+                     (_input.bytes[_value.offset] == '-' ||
+                      _input.bytes[_value.offset] == '+') ) {
+                  return setError(MalformedNumberLiteral);
+                }
+
+                // rewind the byte that terminated this number literal
+                --_input.offset;
+
+                _value.length = literalLength;
+                return setToken(token);
+              }
+            }
+          }
+          // NOTE(review): a number literal that runs to the very end of the
+          // input is reported as End, so the number itself is not surfaced as
+          // a token -- confirm this is intended.
+          return setToken(End);
+        } else {
+          return setError(InvalidByte);
+        }
+      }
+    }
+  }
+
+  return setToken(End);
+}
+
+
+enum { // per-byte escaping classes stored in kUTF8ByteTable
+  kUTF8ByteVerbatim = 0, // byte is copied to the output unchanged
+  kUTF8ByteEncode1, // escaped as "\u000x" (one significant hex digit)
+  kUTF8ByteEncode2, // escaped as "\u00xx" (two significant hex digits)
+};
+#define V kUTF8ByteVerbatim
+#define E1 kUTF8ByteEncode1
+#define E2 kUTF8ByteEncode2
+static const uint8_t kUTF8ByteTable[256] = { // indexed by input byte; an entry that is none of V/E1/E2 is the character written after a backslash
+  E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1, E2, E2,
+  E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, V, V, '"', V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, '\\', V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, E2, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,
+  V, V, V, V, V, V, V, V, V, V
+};
+#undef V
+#undef E1
+#undef E2
+
+// #ifndef __has_feature
+//   #define __has_feature(x) 0
+// #endif
+// #if defined(__cplusplus) && __has_feature(cxx_static_assert)
+//   #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg))
+// #elif __has_feature(c_static_assert)
+//   #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg))
+// #else
+//   #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0)
+// #endif
+
+// Appends the `length` bytes at `v` to the output as a double-quoted JSON
+// string and returns *this.
+//
+// Each byte is classified through kUTF8ByteTable: most bytes are copied
+// verbatim, control bytes (and 0x7F) are written as "\u00XX" with upper-case
+// hex digits, and the remaining special bytes are written as a backslash
+// followed by their short escape character (e.g. \n, \", \\).
+// Only UTF-8 input is supported (checked by assertion).
+//
+// NOTE(review): reserve(n) is assumed to guarantee room for at least `n` more
+// bytes after the current size -- confirm against its definition.
+Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) {
+  reserve(length + 2);
+  _buf[_size++] = '"';
+
+  assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */);
+
+  // Digit characters for a 4-bit value. Indexing avoids the threshold mistake
+  // of treating the value 10 as a decimal digit (which produced ':').
+  static const char kHexDigits[] = "0123456789ABCDEF";
+
+  const uint8_t* end = v+length;
+  while (v != end) {
+    uint8_t s = kUTF8ByteTable[*v];
+    switch (s) {
+      case kUTF8ByteVerbatim:
+        _buf[_size++] = *v;
+        break;
+      case kUTF8ByteEncode1: {
+        assert(*v < 16);
+        size_t remainingSize = end-v+1+5; // five additional bytes needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = 'u';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = kHexDigits[*v & 0x0f];
+        assert(_size <= _capacity);
+        break;
+      }
+      case kUTF8ByteEncode2: {
+        // Note: *v is guaranteed to be within the set [16,32),127. This is
+        // an effect of the kUTF8ByteTable lookup table and this code needs to
+        // be revised if the lookup table adds or removes any kUTF8ByteEncode.
+        assert((*v > 15 && *v < 32) || *v == 127);
+        size_t remainingSize = end-v+1+5; // five additional bytes needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = 'u';
+        _buf[_size++] = '0';
+        _buf[_size++] = '0';
+        _buf[_size++] = kHexDigits[(*v >> 4) & 0x0f]; // high nibble
+        _buf[_size++] = kHexDigits[*v & 0x0f];        // low nibble
+        assert(_size <= _capacity);
+        break;
+      }
+      default: {
+        // reverse solidus escape; `s` is the character that follows it
+        size_t remainingSize = end-v+1+1; // one additional byte needed
+        reserve(remainingSize);
+        _buf[_size++] = '\\';
+        _buf[_size++] = s;
+        assert(_size <= _capacity);
+        break;
+      }
+    }
+
+    ++v;
+  }
+
+  _buf[_size++] = '"';
+  assert(_size <= _capacity);
+  return *this;
+}
+
+#if JSONT_CXX_RVALUE_REFS
+  // Move constructor: takes over the buffer of `other`. The source is left
+  // without a buffer and with zero size and capacity, so its bookkeeping never
+  // describes memory it no longer has.
+  Builder::Builder(Builder&& other)
+      : _buf(other._buf)
+      , _capacity(other._capacity)
+      , _size(other._size)
+      , _state(other._state) {
+    other._buf = 0;
+    other._capacity = 0;
+    other._size = 0;
+  }
+
+  // Move assignment: releases the buffer currently held (buffers are
+  // allocated with malloc, see the copy constructor) before taking over the
+  // one of `other`. Moving an object into itself is a no-op.
+  Builder& Builder::operator=(Builder&& other) {
+    if (this != &other) {
+      free(_buf); // free(0) is a no-op
+      _buf = other._buf;
+      _capacity = other._capacity;
+      _size = other._size;
+      _state = other._state;
+      other._buf = 0;
+      other._capacity = 0;
+      other._size = 0;
+    }
+    return *this;
+  }
+#endif
+
+// Copy constructor: duplicates the state of `other` into a freshly allocated
+// buffer of the same capacity.
+//
+// When `other` holds no buffer (for example after it has been moved from), or
+// when the allocation fails, the new builder starts out empty with zero size
+// and capacity instead of copying from or into a null pointer.
+Builder::Builder(const Builder& other)
+    : _buf(0)
+    , _capacity(0)
+    , _size(0)
+    , _state(other._state) {
+  if (other._buf == 0 || other._capacity == 0) {
+    return;
+  }
+  _buf = (char*)malloc(other._capacity);
+  if (_buf == 0) {
+    return;
+  }
+  _capacity = other._capacity;
+  _size = other._size;
+  memcpy((void*)_buf, (const void*)other._buf, _size);
+}
+
+// Copy assignment: replaces the contents of this builder with a copy of
+// `other`.
+//
+// The copy is prepared first and the previous buffer is released only
+// afterwards, so the old buffer is no longer leaked and self-assignment is
+// harmless. If the allocation fails this builder is left unchanged. When
+// `other` holds no buffer, this builder ends up without one as well.
+Builder& Builder::operator=(const Builder& other) {
+  if (this == &other) {
+    return *this;
+  }
+
+  char* copy = 0;
+  size_t copyCapacity = 0;
+  size_t copySize = 0;
+  if (other._buf != 0 && other._capacity != 0) {
+    copy = (char*)malloc(other._capacity);
+    if (copy == 0) {
+      return *this; // allocation failed: keep the current contents
+    }
+    copyCapacity = other._capacity;
+    copySize = other._size;
+    memcpy((void*)copy, (const void*)other._buf, copySize);
+  }
+
+  free(_buf); // free(0) is a no-op
+  _buf = copy;
+  _capacity = copyCapacity;
+  _size = copySize;
+  _state = other._state;
+  return *this;
+}
+
+} // namespace jsont
commit	f417eaf93c086c0695adeb0e9cacd44e7e537b6a	[log] [tgz]
author	Austin Schuh <austin.linux@gmail.com>	Mon Sep 16 21:58:36 2019 -0700
committer	Austin Schuh <austin.linux@gmail.com>	Mon Sep 16 21:58:36 2019 -0700
tree	1b9c3b952bf1501aacc99fbfd40e7a2c730c1b2d