Blame - third_party/jsont/jsont.cc - RealtimeRoboticsGroup/test

blob: 09b1e4567d24869a9e7035e1a6da6bb4de616c15 [file] [log] [blame]

Austin Schuh	f417eaf	2019-09-16 21:58:36 -0700	[diff] [blame]	1	#include "jsont.hh"
				2
				3	namespace jsont {
				4
				// Maps a byte in the range '0'..'f' (index = byte - '0') to the value of the
				// hexadecimal digit it represents, or -1 when the byte is not a hex digit.
				static const int8_t kHexValueTable[55] = {
				  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0-9
				  -1, -1, -1, -1, -1, -1, -1, // ':' through '@'
				  10, 11, 12, 13, 14, 15, // A-F
				  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'G' through 'S'
				  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 'T' through '`'
				  10, 11, 12, 13, 14, 15 // a-f
				};

				// Parses the `len` bytes starting at `bytes` as an unsigned hexadecimal number
				// (no prefix, upper or lower case digits).
				//
				// Returns the parsed value, or UINT64_MAX when a byte is not a hexadecimal
				// digit or when the value does not fit in 64 bits. Note that the literal
				// "ffffffffffffffff" is therefore indistinguishable from an error. An empty
				// input (len == 0) yields 0.
				static uint64_t _xtou64(const uint8_t* bytes, size_t len) {
				  // Largest value that can still be multiplied by 16 without overflowing,
				  // and the largest digit that may follow it.
				  const uint64_t cutoff = UINT64_MAX / 16;
				  const int cutoff_digit = (int)(UINT64_MAX - cutoff * 16);
				  uint64_t value = 0;

				  for (size_t i = 0; i != len; ++i) {
				    const uint8_t b = bytes[i];
				    const int8_t digit =
				      (b >= '0' && b <= 'f') ? kHexValueTable[b - '0'] : -1;

				    // The digit (not the raw byte, which is unsigned and can never be -1)
				    // tells us whether the byte was a valid hexadecimal digit.
				    if (digit == -1 || // bad digit
				        (value > cutoff) || // overflow
				        ((value == cutoff) && (digit > cutoff_digit))) {
				      return UINT64_MAX;
				    }

				    value = (value * 16) + (uint64_t)digit;
				  }

				  return value;
				}
				33
				34
				// _JSONT_NAN expands to a quiet NaN of floating-point type. The NAN macro is
				// used when the C library provides it; otherwise we fall back to nan() with an
				// empty tag string (nan() expects a C string, so a null pointer must not be
				// passed).
				#ifdef NAN
				#define _JSONT_NAN NAN
				#else
				#define _JSONT_NAN nan("")
				#endif
				40
				41
				42	const char* token_name(jsont::Token tok) {
				43	switch (tok) {
				44	case End: return "End";
				45	case ObjectStart: return "ObjectStart";
				46	case ObjectEnd: return "ObjectEnd";
				47	case ArrayStart: return "ArrayStart";
				48	case ArrayEnd: return "ArrayEnd";
				49	case True: return "True";
				50	case False: return "False";
				51	case Null: return "Null";
				52	case Integer: return "Integer";
				53	case Float: return "Float";
				54	case String: return "String";
				55	case FieldName: return "FieldName";
				56	default: return "?";
				57	}
				58	}
				59
				60
				61	class TokenizerInternal {
				62	public:
				63	inline static const uint8_t* currentInput(const Tokenizer& self) {
				64	return self._input.bytes + self._input.offset;
				65	}
				66
				67	inline static const Token& readAtom(Tokenizer& self, const char* str,
				68	size_t len, const Token& token) {
				69	if (self.availableInput() < len) {
				70	return self.setError(Tokenizer::PrematureEndOfInput);
				71	} else if (memcmp(currentInput(self), str, len) != 0) {
				72	return self.setError(Tokenizer::InvalidByte);
				73	} else {
				74	self._input.offset += len;
				75	return self.setToken(token);
				76	}
				77	}
				78	};
				79
				80
				81	Tokenizer::~Tokenizer() {}
				82
				83
				84	void Tokenizer::reset(const char* bytes, size_t length, TextEncoding encoding) {
				85	assert(encoding == UTF8TextEncoding); // only supported encoding
				86	_input.bytes = (const uint8_t*)bytes;
				87	_input.length = length;
				88	_input.offset = 0;
				89	_error.code = UnspecifiedError;
				90	// Advance to first token
				91	next();
				92	}
				93
				94
				95	const char* Tokenizer::errorMessage() const {
				96	switch (_error.code) {
				97	case UnexpectedComma:
				98	return "Unexpected comma";
				99	case UnexpectedTrailingComma:
				100	return "Unexpected trailing comma";
				101	case InvalidByte:
				102	return "Invalid input byte";
				103	case PrematureEndOfInput:
				104	return "Premature end of input";
				105	case MalformedUnicodeEscapeSequence:
				106	return "Malformed Unicode escape sequence";
				107	case MalformedNumberLiteral:
				108	return "Malformed number literal";
				109	case UnterminatedString:
				110	return "Unterminated string";
				111	case SyntaxError:
				112	return "Illegal JSON (syntax error)";
				113	default:
				114	return "Unspecified error";
				115	}
				116	}
				117
				118
				119	size_t Tokenizer::dataValue(const char const** bytes) const {
				120	if (!hasValue()) { return 0; }
				121	if (_value.buffered) {
				122	bytes = (const char const)_value.buffer.data();
				123	return _value.buffer.size();
				124	} else {
				125	bytes = (const char const)(_input.bytes + _value.offset);
				126	return _value.length;
				127	}
				128	}
				129
				130
				131	double Tokenizer::floatValue() const {
				132	if (!hasValue()) {
				133	return _token == jsont::True ? 1.0 : 0.0;
				134	}
				135
				136	const char* bytes;
				137
				138	if (_value.buffered) {
				139	// edge-case since only happens with string values using escape sequences
				140	bytes = _value.buffer.c_str();
				141	} else {
				142	bytes = (const char*)_input.bytes + _value.offset;
				143	if (availableInput() == 0) {
				144	// In this case where the data lies at the edge of the buffer, we can't pass
				145	// it directly to atof, since there will be no sentinel byte. We are fine
				146	// with a copy, since this is an edge case (only happens either for broken
				147	// JSON or when the whole document is just a number).
				148	char* buf[128];
				149	if (_value.length > 127) {
				150	// We are unable to interpret such a large literal in this edge-case
				151	return _JSONT_NAN;
				152	}
				153	memcpy((void)buf, (const void)bytes, _value.length);
				154	buf[_value.length] = '\0';
				155	return strtod((const char)buf, (char*)0);
				156	}
				157	}
				158
				159	return strtod(bytes, (char**)0);
				160	}
				161
				162
				163	int64_t Tokenizer::intValue() const {
				164	if (!hasValue()) {
				165	return _token == jsont::True ? 1LL : 0LL;
				166	}
				167
				168	const char* bytes;
				169
				170	if (_value.buffered) {
				171	// edge-case since only happens with string values using escape sequences
				172	bytes = _value.buffer.c_str();
				173	} else {
				174	bytes = (const char*)_input.bytes + _value.offset;
				175	if (availableInput() == 0) {
				176	// In this case where the data lies at the edge of the buffer, we can't pass
				177	// it directly to atof, since there will be no sentinel byte. We are fine
				178	// with a copy, since this is an edge case (only happens either for broken
				179	// JSON or when the whole document is just a number).
				180	char* buf[21];
				181	if (_value.length > 20) {
				182	// We are unable to interpret such a large literal in this edge-case
				183	return 0;
				184	}
				185	memcpy((void)buf, (const void)bytes, _value.length);
				186	buf[_value.length] = '\0';
				187	return strtoll((const char)buf, (char*)0, 10);
				188	}
				189	}
				190
				191	return strtoll(bytes, (char**)0, 10);
				192	}
				193
				194
				195	const Token& Tokenizer::next() {
				196	//
				197	// { } [ ] n t f "
				198	// \| \| \| \|
				199	// \| \| \| +- /[^"]*/ "
				200	// \| \| +- a l s e
				201	// \| +- r u e
				202	// +- u l l
				203	//
				204	while (!endOfInput()) {
				205	uint8_t b = _input.bytes[_input.offset++];
				206	switch (b) {
				207	case '{': return setToken(ObjectStart);
				208	case '}': {
				209	if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
				210	return setToken(ObjectEnd);
				211	}
				212
				213	case '[': return setToken(ArrayStart);
				214	case ']': {
				215	if (_token == _Comma) { return setError(UnexpectedTrailingComma); }
				216	return setToken(ArrayEnd);
				217	}
				218
				219	case 'n':
				220	return TokenizerInternal::readAtom(*this, "ull", 3, jsont::Null);
				221	case 't':
				222	return TokenizerInternal::readAtom(*this, "rue", 3, jsont::True);
				223	case 'f':
				224	return TokenizerInternal::readAtom(*this, "alse", 4, jsont::False);
				225
				226	case ' ': case '\t': case '\r': case '\n': // IETF RFC4627
				227	// ignore whitespace and let the outer "while" do its thing
				228	break;
				229
				230	case 0:
				231	return setError(InvalidByte);
				232
				233	// when we read a value, we don't produce a token until we either reach
				234	// end of input, a colon (then the value is a field name), a comma, or an
				235	// array or object terminator.
				236
				237	case '"': {
				238	_value.beginAtOffset(_input.offset);
				239
				240	while (!endOfInput()) {
				241	b = _input.bytes[_input.offset++];
				242	assert(_input.offset < _input.length);
				243
				244	switch (b) {
				245
				246	case '\\': {
				247	// We must go buffered since the input segment != value
				248	if (!_value.buffered) {
				249	_value.buffered = true;
				250	_value.buffer.assign(
				251	(const char*)(_input.bytes+_value.offset),
				252	_input.offset - _value.offset - 1
				253	);
				254	}
				255
				256	if (endOfInput()) {
				257	return setError(PrematureEndOfInput);
				258	}
				259
				260	b = _input.bytes[_input.offset++];
				261	switch (b) {
				262	case 'b': _value.buffer.append(1, '\x08'); break;
				263	case 'f': _value.buffer.append(1, '\x0C'); break;
				264	case 'n': _value.buffer.append(1, '\x0A'); break;
				265	case 'r': _value.buffer.append(1, '\x0D'); break;
				266	case 't': _value.buffer.append(1, '\x09'); break;
				267	case 'u': {
				268	// \uxxxx
				269	if (availableInput() < 4) {
				270	return setError(PrematureEndOfInput);
				271	}
				272
				273	uint64_t utf16cp =
				274	_xtou64(TokenizerInternal::currentInput(*this), 4);
				275	_input.offset += 4;
				276
				277	if (utf16cp > 0xffff) {
				278	return setError(MalformedUnicodeEscapeSequence);
				279	}
				280
				281	uint16_t cp = (uint16_t)(0xffff & utf16cp);
				282
				283	// Append UTF-8 byte(s) representing the Unicode codepoint cp
				284	if (cp < 0x80) {
				285	// U+0000 - U+007F
				286	uint8_t cp8 = ((uint8_t)cp);
				287	_value.buffer.append(1, (char)cp8);
				288	} else if (cp < 0x800) {
				289	// U+0080 - U+07FF
				290	uint8_t cp8 = (uint8_t)((cp >> 6) \| 0xc0);
				291	_value.buffer.append(1, (char)cp8);
				292	cp8 = (uint8_t)((cp & 0x3f) \| 0x80);
				293	_value.buffer.append(1, (char)cp8);
				294	} else if (cp >= 0xD800u && cp <= 0xDFFFu) {
				295	// UTF-16 Surrogate pairs -- according to the UTF-8
				296	// definition (RFC 3629) the high and low surrogate halves
				297	// used by UTF-16 (U+D800 through U+DFFF) are not legal
				298	// Unicode values, and the UTF-8 encoding of them is an
				299	// invalid byte sequence. Instead of throwing an error, we
				300	// substitute this character with the replacement character
				301	// U+FFFD (UTF-8: EF,BF,BD).
				302	_value.buffer.append("\xEF\xBF\xBD");
				303	//
				304	} else {
				305	// U+0800 - U+FFFF
				306	uint8_t cp8 = (uint8_t)((cp >> 12) \| 0xe0);
				307	_value.buffer.append(1, (char)cp8);
				308	cp8 = (uint8_t)(((cp >> 6) & 0x3f) \| 0x80);
				309	_value.buffer.append(1, (char)cp8);
				310	cp8 = (uint8_t)((cp & 0x3f) \| 0x80);
				311	_value.buffer.append(1, (char)cp8);
				312	}
				313
				314	break;
				315	}
				316	default:
				317	_value.buffer.append(1, (char)b); break;
				318	}
				319	break;
				320	}
				321
				322	case '"':
				323	goto after_initial_read_b;
				324
				325	case 0:
				326	return setError(InvalidByte);
				327
				328	default: {
				329	if (_value.buffered) {
				330	// TODO: Make this efficient by appending chunks between
				331	// boundaries instead of appending per-byte
				332	_value.buffer.append(1, (char)b);
				333	}
				334	break;
				335	}
				336	} // switch(b)
				337	} // while (!endOfInput())
				338
				339	after_initial_read_b:
				340	if (b != '"') {
				341	return setError(UnterminatedString);
				342	}
				343
				344	if (!_value.buffered) {
				345	_value.length = _input.offset - _value.offset - 1;
				346	}
				347
				348	// is this a field name?
				349	while (!endOfInput()) {
				350	b = _input.bytes[_input.offset++];
				351	switch (b) {
				352	case ' ': case '\t': case '\r': case '\n': break;
				353	case ':': return setToken(FieldName);
				354	case ',': goto string_read_return_string;
				355	case ']': case '}': {
				356	--_input.offset; // rewind
				357	goto string_read_return_string;
				358	}
				359	case 0: return setError(InvalidByte);
				360	default: {
				361	// Expected a comma or a colon
				362	return setError(SyntaxError);
				363	}
				364	}
				365	}
				366
				367	string_read_return_string:
				368	return setToken(jsont::String);
				369	}
				370
				371	case ',': {
				372	if (_token == ObjectStart \|\| _token == ArrayStart \|\| _token == _Comma) {
				373	return setError(UnexpectedComma);
				374	}
				375	_token = _Comma;
				376	break;
				377	}
				378
				379	default: {
				380	if (isdigit((int)b) \|\| b == '+' \|\| b == '-') {
				381	// We are reading a number
				382	_value.beginAtOffset(_input.offset-1);
				383	Token token = jsont::Integer;
				384
				385	while (!endOfInput()) {
				386	b = _input.bytes[_input.offset++];
				387	switch (b) {
				388	case '0'...'9': break;
				389	case '.': token = jsont::Float; break;
				390	case 'E': case 'e': case '-': case '+': {
				391	if (token != jsont::Float) {
				392	return setError(MalformedNumberLiteral);
				393	}
				394	break;
				395	}
				396	default: {
				397	if ( (_input.offset - _value.offset == 1) &&
				398	(_input.bytes[_value.offset] == '-' \|\|
				399	_input.bytes[_value.offset] == '+') ) {
				400	return setError(MalformedNumberLiteral);
				401	}
				402
				403	// rewind the byte that terminated this number literal
				404	--_input.offset;
				405
				406	_value.length = _input.offset - _value.offset - 1;
				407	return setToken(token);
				408	}
				409	}
				410	}
				411	return setToken(End);
				412	} else {
				413	return setError(InvalidByte);
				414	}
				415	}
				416	}
				417	}
				418
				419	return setToken(End);
				420	}
				421
				422
				// Classification of an input byte when it is written into a JSON string.
				enum {
				  kUTF8ByteVerbatim = 0, // copied as-is
				  kUTF8ByteEncode1, // "\u000x"
				  kUTF8ByteEncode2, // "\u00xx"
				};
				#define V kUTF8ByteVerbatim
				#define E1 kUTF8ByteEncode1
				#define E2 kUTF8ByteEncode2
				// Indexed by byte value. An entry is either one of the classes above or, for
				// bytes that have a short escape, the character that follows the backslash
				// (e.g. 'n' for a line feed). Laid out 16 entries per row; the comment on each
				// row gives the byte value of its first entry.
				static const uint8_t kUTF8ByteTable[256] = {
				  E1, E1, E1, E1, E1, E1, E1, E1, 'b', 't', 'n', E1, 'f', 'r', E1, E1, // 0x00
				  E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, E2, // 0x10
				  V, V, '"', V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x20
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x30
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x40
				  V, V, V, V, V, V, V, V, V, V, V, V, '\\', V, V, V, // 0x50
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x60
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, E2, // 0x70
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x80
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0x90
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0xA0
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0xB0
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0xC0
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0xD0
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, // 0xE0
				  V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V  // 0xF0
				};
				#undef V
				#undef E1
				#undef E2
				447
				448	// #ifndef __has_feature
				449	// #define __has_feature(x) 0
				450	// #endif
				451	// #if defined(__cplusplus) && __has_feature(cxx_static_assert)
				452	// #define JSONT_CONST_ASSERT(expr, error_msg) static_assert((expr), (error_msg))
				453	// #elif __has_feature(c_static_assert)
				454	// #define JSONT_CONST_ASSERT(expr, error_msg) _Static_assert((expr), (error_msg))
				455	// #else
				456	// #define JSONT_CONST_ASSERT(expr, error_msg) ((void)0)
				457	// #endif
				458
				459	Builder& Builder::appendString(const uint8_t* v, size_t length, TextEncoding encoding) {
				460	reserve(length + 2);
				461	_buf[_size++] = '"';
				462
				463	assert(encoding == UTF8TextEncoding /* Currently only UTF-8 is supported */);
				464
				465	const uint8_t* end = v+length;
				466	while (v != end) {
				467	uint8_t s = kUTF8ByteTable[*v];
				468	switch (s) {
				469	case kUTF8ByteVerbatim:
				470	_buf[_size++] = *v;
				471	break;
				472	case kUTF8ByteEncode1: {
				473	assert(*v < 16);
				474	size_t remainingSize = end-v+1+5; // five additional bytes needed
				475	reserve(remainingSize);
				476	_buf[_size] = '\\';
				477	_buf[++_size] = 'u';
				478	_buf[++_size] = '0';
				479	_buf[++_size] = '0';
				480	_buf[++_size] = '0';
				481	_buf[++_size] = v + (v > 10 ? 55 : 48); // A-F : 0-9
				482	++_size;
				483	assert(_size <= _capacity);
				484	break;
				485	}
				486	case kUTF8ByteEncode2: {
				487	// Note: *v is guaranteed to be within the set [16,32),127. This is
				488	// an affect of the kUTF8ByteTable lookup table and this code needs to
				489	// be revised if the lookup table adds or removes any kUTF8ByteEncode.
				490	assert((v > 15 && v < 32) \|\| *v == 127);
				491	size_t remainingSize = end-v+1+5; // five additional bytes needed
				492	reserve(remainingSize);
				493	_buf[_size] = '\\';
				494	_buf[++_size] = 'u';
				495	_buf[++_size] = '0';
				496	_buf[++_size] = '0';
				497	uint8_t b1 = (*v & 0xf0) / 16;
				498	//uint8_t b1 = (*v & 0xf0) >> 4; // slightly faster but LE-specific
				499	uint8_t b2 = *v & 0x0f;
				500	_buf[++_size] = b1 + (b1 > 10 ? 55 : 48); // A-F : 0-9
				501	_buf[++_size] = b2 + (b2 > 10 ? 55 : 48); // A-F : 0-9
				502	++_size;
				503	assert(_size <= _capacity);
				504	break;
				505	}
				506	default:
				507	// reverse solidus escape
				508	size_t remainingSize = end-v+1+1; // one additional byte needed
				509	reserve(remainingSize);
				510	_buf[_size++] = '\\';
				511	_buf[_size++] = s;
				512	assert(_size <= _capacity);
				513	break;
				514	}
				515
				516	++v;
				517	}
				518
				519	_buf[_size++] = '"';
				520	assert(_size <= _capacity);
				521	return *this;
				522	}
				523
				#if JSONT_CXX_RVALUE_REFS
				// Move constructor: takes over `other`'s buffer and leaves `other` empty
				// (no buffer, zero capacity and size).
				Builder::Builder(Builder&& other)
				  : _buf(other._buf)
				  , _capacity(other._capacity)
				  , _size(other._size)
				  , _state(other._state) {
				  other._buf = 0;
				  other._capacity = 0;
				  other._size = 0;
				}

				// Move assignment: releases the buffer currently held, then takes over
				// `other`'s buffer and leaves `other` empty. Self-assignment is a no-op.
				//
				// NOTE(review): assumes _buf always comes from the malloc family (as in the
				// copy constructor) so that free() is the matching release -- confirm against
				// reserve() and the destructor.
				Builder& Builder::operator=(Builder&& other) {
				  if (this != &other) {
				    free(_buf);
				    _buf = other._buf;
				    _capacity = other._capacity;
				    _size = other._size;
				    _state = other._state;
				    other._buf = 0;
				    other._capacity = 0;
				    other._size = 0;
				  }
				  return *this;
				}
				#endif
				542
				543	Builder::Builder(const Builder& other)
				544	: _buf(0)
				545	, _capacity(other._capacity)
				546	, _size(other._size)
				547	, _state(other._state) {
				548	_buf = (char*)malloc(_capacity);
				549	memcpy((void)_buf, (const void)other._buf, _size);
				550	}
				551
				552	Builder& Builder::operator=(const Builder& other) {
				553	_capacity = other._capacity;
				554	_size = other._size;
				555	_state = other._state;
				556	_buf = (char*)malloc(_capacity);
				557	memcpy((void)_buf, (const void)other._buf, _size);
				558	return *this;
				559	}
				560
				561	} // namespace jsont