Blame - aos/json_tokenizer.cc - RealtimeRoboticsGroup/test

blob: eab7fccededf9b9082fd53073102d720e89ffcd3 [file] [log] [blame]

Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	1	#include "aos/json_tokenizer.h"
				2
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	3	#include <cerrno>
Brian Silverman	4c7235a	2021-11-17 19:04:37 -0800	[diff] [blame]	4	#include <limits>
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	5
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	6	namespace aos {
				7
				8	void Tokenizer::ConsumeWhitespace() {
				9	while (true) {
				10	if (AtEnd()) {
				11	return;
				12	}
				13	// Skip any whitespace.
				14	if (Char() == ' ' \|\| Char() == '\r' \|\| Char() == '\t') {
				15	ConsumeChar();
				16	} else if (Char() == '\n') {
				17	ConsumeChar();
				18	++linenumber_;
Austin Schuh	81da4b2	2019-10-06 14:03:24 -0700	[diff] [blame]	19	} else if (Consume("/*")) {
				20	while (!Consume("*/")) {
				21	if (Char() == '\n') {
				22	++linenumber_;
				23	}
				24	ConsumeChar();
				25	}
Stephan Pleines	8983685	2023-09-15 20:11:57 -0700	[diff] [blame]	26	} else if (Consume("//")) {
				27	// C++ style comment. Keep consuming chars until newline, or until the
				28	// end of the file if this is the last line (no newline at end of file).
				29	while (true) {
				30	ConsumeChar();
				31	if (AtEnd()) {
				32	return;
				33	}
				34	if (Char() == '\n') {
				35	++linenumber_;
				36	break;
				37	}
				38	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	39	} else {
				40	// There is no fail. Once we are out of whitespace (including 0 of it),
				41	// declare success.
				42	return;
				43	}
				44	}
				45	}
				46
				47	bool Tokenizer::Consume(const char *token) {
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	48	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	49	while (true) {
				50	// Finishing the token is success.
				51	if (*token == '\0') {
				52	return true;
				53	}
				54
				55	// But finishing the data first is failure.
				56	if (AtEnd()) {
				57	data_ = original;
				58	return false;
				59	}
				60
				61	// Missmatch is failure.
				62	if (*token != Char()) {
				63	data_ = original;
				64	return false;
				65	}
				66
				67	ConsumeChar();
				68	++token;
				69	}
				70	}
				71
				72	bool Tokenizer::ConsumeString(::std::string *s) {
				73	// Under no conditions is it acceptible to run out of data while parsing a
				74	// string. Any AtEnd checks should confirm that.
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	75	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	76	if (AtEnd()) {
				77	return false;
				78	}
				79
				80	// Expect the leading "
				81	if (Char() != '"') {
				82	return false;
				83	}
				84
				85	ConsumeChar();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	86	std::string_view last_parsed_data = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	87	*s = ::std::string();
				88
				89	while (true) {
				90	if (AtEnd()) {
				91	data_ = original;
				92	return false;
				93	}
				94
				95	// If we get an end or an escape, do something special.
				96	if (Char() == '"' \|\| Char() == '\\') {
				97	// Save what we found up until now, not including this character.
				98	*s += ::std::string(
				99	last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
				100
				101	// Update the pointer.
				102	last_parsed_data = data_;
				103
				104	// " is the end, declare victory.
				105	if (Char() == '"') {
				106	ConsumeChar();
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	107	if (unicode_high_surrogate_ != -1) {
				108	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				109	data_ = original;
				110	return false;
				111	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	112	return true;
				113	} else {
				114	ConsumeChar();
				115	// Now consume valid escape characters and add their representation onto
				116	// the output string.
				117	if (AtEnd()) {
				118	data_ = original;
				119	return false;
				120	} else if (Char() == '"') {
				121	*s += "\"";
				122	} else if (Char() == '\\') {
				123	*s += "\\";
				124	} else if (Char() == '/') {
				125	*s += "/";
				126	} else if (Char() == 'b') {
				127	*s += "\b";
				128	} else if (Char() == 'f') {
				129	*s += "\f";
				130	} else if (Char() == 'n') {
				131	*s += "\n";
				132	} else if (Char() == 'r') {
				133	*s += "\r";
				134	} else if (Char() == 't') {
				135	*s += "\t";
				136	} else if (Char() == 'u') {
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	137	if (!ConsumeUnicode(s)) {
				138	fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
				139	data_ = original;
				140	return false;
				141	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	142	}
				143	}
				144	// And skip the escaped character.
				145	last_parsed_data = data_.substr(1);
				146	}
				147
				148	ConsumeChar();
				149	}
				150	}
				151
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	152	bool Tokenizer::ConsumeUnicode(::std::string *s) {
				153	// Under no conditions is it acceptible to run out of data while parsing a
				154	// unicode. Any AtEnd checks should confirm that.
				155	uint32_t val;
				156
				157	// Consume unicode representation
				158	ConsumeChar();
				159
				160	char target[5];
				161
				162	// Valid unicode is 4 hex digits so evaluate the next 4 characters
				163	for (int count = 0; count < 4; count++) {
				164	// If there is no data or data is an invalid char, return false
				165	if (AtEnd()) {
				166	return false;
				167	}
				168
				169	if (!isxdigit(Char())) {
				170	return false;
				171	}
				172
				173	target[count] = Char();
				174
				175	// Do not consume the last character
				176	if (count == 3) {
				177	break;
				178	}
				179
				180	ConsumeChar();
				181	}
				182	target[4] = '\0';
				183
				184	// References: flatbuffers/src/idl_parser.cpp
				185	val = flatbuffers::StringToUInt(target, 16);
				186
				187	if (val >= 0xD800 && val <= 0xDBFF) {
				188	if (unicode_high_surrogate_ != -1) {
				189	fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
				190	return false;
				191	} else {
				192	unicode_high_surrogate_ = static_cast<int>(val);
				193	}
				194	} else if (val >= 0xDC00 && val <= 0xDFFF) {
				195	if (unicode_high_surrogate_ == -1) {
				196	fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
				197	return false;
				198	} else {
				199	int code_point =
				200	0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
				201	flatbuffers::ToUTF8(code_point, s);
				202	unicode_high_surrogate_ = -1;
				203	}
				204	} else {
				205	if (unicode_high_surrogate_ != -1) {
				206	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				207	return false;
				208	}
				209	flatbuffers::ToUTF8(static_cast<int>(val), s);
				210	}
				211	return true;
				212	}
				213
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	214	bool Tokenizer::ConsumeNumber(::std::string *s) {
				215	// Under no conditions is it acceptible to run out of data while parsing a
				216	// number. Any AtEnd() checks should confirm that.
				217	*s = ::std::string();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	218	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	219
				220	// Consume the leading - unconditionally.
				221	Consume("-");
				222
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	223	// See if we find nan. This isn't standards compliant, but is what
				224	// flatbuffers prints out, so we need to parse it.
				225	if (Consume("nan")) {
				226	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				227	return true;
				228	}
				229
Brian Silverman	714b1d6	2020-04-28 16:52:54 -0700	[diff] [blame]	230	// People tend to use null instead of nan. Accept that too.
				231	if (Consume("null")) {
				232	*s = ::std::string("nan");
				233	return true;
				234	}
				235
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	236	// Inf is also acceptable.
				237	if (Consume("inf")) {
				238	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				239	return true;
				240	}
				241
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	242	// Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
				243	// by a second number.
				244	if (!Consume("0")) {
				245	if (AtEnd()) {
				246	return false;
				247	} else if (Char() >= '1' && Char() <= '9') {
				248	// This wasn't a zero, but was a valid digit. Consume it.
				249	ConsumeChar();
				250	} else {
				251	return false;
				252	}
				253
				254	// Now consume any number of any digits.
				255	while (true) {
				256	if (AtEnd()) {
				257	data_ = original;
				258	return false;
				259	}
				260	if (Char() < '0' \|\| Char() > '9') {
				261	break;
				262	}
				263	ConsumeChar();
				264	}
				265	}
				266
				267	// We could now have a decimal.
				268	if (Char() == '.') {
				269	ConsumeChar();
				270	while (true) {
				271	if (AtEnd()) {
				272	data_ = original;
				273	return false;
				274	}
				275	// And any number of digits.
				276	if (Char() < '0' \|\| Char() > '9') {
				277	break;
				278	}
				279	ConsumeChar();
				280	}
				281	}
				282
				283	// And now an exponent.
				284	if (Char() == 'e' \|\| Char() == 'E') {
				285	ConsumeChar();
				286	if (AtEnd()) {
				287	data_ = original;
				288	return false;
				289	}
				290
				291	// Which could have a +-
				292	if (Char() == '+' \|\| Char() == '-') {
				293	ConsumeChar();
				294	}
				295	int count = 0;
				296	while (true) {
				297	if (AtEnd()) {
				298	data_ = original;
				299	return false;
				300	}
				301	// And digits.
				302	if (Char() < '0' \|\| Char() > '9') {
				303	break;
				304	}
				305	ConsumeChar();
				306	++count;
				307	}
				308	// But, it is an error to have an exponent and nothing following it.
				309	if (count == 0) {
				310	data_ = original;
				311	return false;
				312	}
				313	}
				314
				315	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				316	return true;
				317	}
				318
				319	Tokenizer::TokenType Tokenizer::Next() {
				320	switch (state_) {
				321	case State::kExpectObjectStart:
				322	// We should always start out with a {
Austin Schuh	6f89670	2020-03-19 16:07:20 -0700	[diff] [blame]	323	if (!Consume("{")) {
				324	fprintf(stderr, "Error on line %d, expected { for start.\n",
				325	linenumber_);
				326	return TokenType::kError;
				327	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	328
				329	// Document that we just started an object.
				330	object_type_.push_back(ObjectType::kObject);
				331
				332	ConsumeWhitespace();
				333
				334	if (Consume("}")) {
				335	ConsumeWhitespace();
				336	state_ = State::kExpectObjectEnd;
				337	} else {
				338	state_ = State::kExpectField;
				339	}
				340	return TokenType::kStartObject;
				341
				342	case State::kExpectField: {
				343	// Fields are built up of strings, whitespace, and then a : (followed by
				344	// whitespace...)
				345	::std::string s;
				346	if (!ConsumeString(&s)) {
				347	fprintf(stderr, "Error on line %d, expected string for field name.\n",
				348	linenumber_);
Austin Schuh	9270084	2019-12-28 13:00:17 -0800	[diff] [blame]	349	if (Consume("}")) {
				350	fprintf(stderr,
				351	"Got '}' instead. Did you add an extra trailing ','?\n");
				352	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	353	return TokenType::kError;
				354	}
				355	field_name_ = ::std::move(s);
				356
				357	ConsumeWhitespace();
				358
				359	if (!Consume(":")) {
Austin Schuh	2595a14	2020-11-29 22:43:57 -0800	[diff] [blame]	360	fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
				361	linenumber_, Char());
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	362	return TokenType::kError;
				363	}
				364
				365	ConsumeWhitespace();
				366
				367	state_ = State::kExpectValue;
				368
				369	return TokenType::kField;
				370	} break;
				371	case State::kExpectValue: {
				372	TokenType result = TokenType::kError;
				373
				374	::std::string s;
				375	if (Consume("{")) {
				376	// Fields are in objects. Record and recurse.
				377	object_type_.push_back(ObjectType::kObject);
				378
				379	ConsumeWhitespace();
				380
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	381	// And then if we encounter the end again, go to the end state.
				382	if (Consume("}")) {
				383	ConsumeWhitespace();
				384	state_ = State::kExpectObjectEnd;
				385	} else {
				386	state_ = State::kExpectField;
				387	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	388	return TokenType::kStartObject;
				389	} else if (Consume("[")) {
				390	// Values are in arrays. Record and recurse.
				391	object_type_.push_back(ObjectType::kArray);
				392
				393	ConsumeWhitespace();
				394	state_ = State::kExpectValue;
				395	return TokenType::kStartArray;
				396	} else if (ConsumeString(&s)) {
				397	// Parsed as a string, grab it.
				398	field_value_ = ::std::move(s);
				399	result = TokenType::kStringValue;
				400	} else if (ConsumeNumber(&s)) {
				401	// Parsed as a number, grab it.
				402	field_value_ = ::std::move(s);
				403	result = TokenType::kNumberValue;
				404	} else if (Consume("true")) {
				405	// Parsed as a true, grab it.
				406	field_value_ = "true";
				407	result = TokenType::kTrueValue;
				408	} else if (Consume("false")) {
				409	// Parsed as a false, grab it.
				410	field_value_ = "false";
				411	result = TokenType::kFalseValue;
				412	} else {
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	413	switch (object_type_.back()) {
				414	case ObjectType::kObject:
				415	if (Consume("}")) {
				416	ConsumeWhitespace();
				417	state_ = State::kExpectObjectEnd;
				418	return Next();
				419	}
				420	break;
				421	case ObjectType::kArray:
				422	if (Consume("]")) {
				423	ConsumeWhitespace();
				424	state_ = State::kExpectArrayEnd;
				425	return Next();
				426	}
				427	break;
				428	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	429	// Couldn't parse, so we have a syntax error.
				430	fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
				431	}
				432
				433	ConsumeWhitespace();
				434
				435	// After a field, we either have a , and another field (or value if we are
				436	// in an array), or we should be closing out the object (or array).
				437	if (Consume(",")) {
				438	ConsumeWhitespace();
				439	switch (object_type_.back()) {
				440	case ObjectType::kObject:
				441	state_ = State::kExpectField;
				442	break;
				443	case ObjectType::kArray:
				444	state_ = State::kExpectValue;
				445	break;
				446	}
				447	} else {
				448	// Sanity check that the stack is deep enough.
				449	if (object_type_.size() == 0) {
				450	fprintf(stderr, "Error on line %d\n", linenumber_);
				451	return TokenType::kError;
				452	}
				453
				454	// And then require closing out the object.
				455	switch (object_type_.back()) {
				456	case ObjectType::kObject:
				457	if (Consume("}")) {
				458	ConsumeWhitespace();
				459	state_ = State::kExpectObjectEnd;
				460	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	461	fprintf(stderr, "Error on line %d, expected } or ,\n",
				462	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	463	return TokenType::kError;
				464	}
				465	break;
				466	case ObjectType::kArray:
				467	if (Consume("]")) {
				468	ConsumeWhitespace();
				469	state_ = State::kExpectArrayEnd;
				470	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	471	fprintf(stderr, "Error on line %d, expected ] or ,\n",
				472	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	473	return TokenType::kError;
				474	}
				475	break;
				476	}
				477	}
				478	return result;
				479	} break;
				480
				481	case State::kExpectArrayEnd:
				482	case State::kExpectObjectEnd: {
				483	const TokenType result = state_ == State::kExpectArrayEnd
				484	? TokenType::kEndArray
				485	: TokenType::kEndObject;
				486	// This is a transient state so we can send 2 tokens out in a row. We
				487	// discover the object or array end at the end of reading the value.
				488	object_type_.pop_back();
				489	if (object_type_.size() == 0) {
				490	// We unwound the outer object. We should send kEnd next.
				491	state_ = State::kExpectEnd;
				492	} else if (object_type_.back() == ObjectType::kObject) {
				493	// If we are going into an object, it should either have another field
				494	// or end.
				495	if (Consume(",")) {
				496	ConsumeWhitespace();
				497	state_ = State::kExpectField;
				498	} else if (Consume("}")) {
				499	ConsumeWhitespace();
				500	state_ = State::kExpectObjectEnd;
				501	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	502	fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	503	return TokenType::kError;
				504	}
				505	} else if (object_type_.back() == ObjectType::kArray) {
				506	// If we are going into an array, it should either have another value
				507	// or end.
				508	if (Consume(",")) {
				509	ConsumeWhitespace();
				510	state_ = State::kExpectValue;
				511	} else if (Consume("]")) {
				512	ConsumeWhitespace();
				513	state_ = State::kExpectArrayEnd;
				514	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	515	fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	516	return TokenType::kError;
				517	}
				518	}
				519	// And then send out the correct token.
				520	return result;
				521	}
				522	case State::kExpectEnd:
				523	// If we are supposed to be done, confirm nothing is after the end.
				524	if (AtEnd()) {
				525	return TokenType::kEnd;
				526	} else {
				527	fprintf(stderr, "Data past end at line %d\n", linenumber_);
				528	return TokenType::kError;
				529	}
				530	}
				531	return TokenType::kError;
				532	}
				533
				534	bool Tokenizer::FieldAsInt(long long *value) {
				535	const char *pos = field_value().c_str();
				536	errno = 0;
				537	value = strtoll(field_value().c_str(), const_cast<char *>(&pos), 10);
				538	if (pos != field_value().c_str() + field_value().size() \|\| errno != 0) {
				539	return false;
				540	}
				541	return true;
				542	}
				543
				544	bool Tokenizer::FieldAsDouble(double *value) {
				545	const char *pos = field_value().c_str();
				546	errno = 0;
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	547	if (field_value() == "nan") {
				548	*value = std::numeric_limits<double>::quiet_NaN();
				549	return true;
				550	} else if (field_value() == "-nan") {
				551	*value = -std::numeric_limits<double>::quiet_NaN();
				552	return true;
				553	}
				554
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	555	if (field_value() == "inf") {
				556	*value = std::numeric_limits<double>::infinity();
				557	return true;
				558	} else if (field_value() == "-inf") {
				559	*value = -std::numeric_limits<double>::infinity();
				560	return true;
				561	}
				562
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	563	value = strtod(field_value().c_str(), const_cast<char *>(&pos));
				564
				565	if (pos != field_value().c_str() + field_value().size() \|\| errno != 0) {
				566	return false;
				567	}
				568	return true;
				569	}
				570
				571	} // namespace aos