Blame - aos/json_tokenizer.cc - RealtimeRoboticsGroup/test

blob: b3c66200c6b2285de5fd5cbf171d977d2edb259f [file] [log] [blame]

Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	1	#include "aos/json_tokenizer.h"
				2
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	3	#include <cerrno>
Brian Silverman	4c7235a	2021-11-17 19:04:37 -0800	[diff] [blame]	4	#include <limits>
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	5
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	6	namespace aos {
				7
				8	void Tokenizer::ConsumeWhitespace() {
				9	while (true) {
				10	if (AtEnd()) {
				11	return;
				12	}
				13	// Skip any whitespace.
				14	if (Char() == ' ' \|\| Char() == '\r' \|\| Char() == '\t') {
				15	ConsumeChar();
				16	} else if (Char() == '\n') {
				17	ConsumeChar();
				18	++linenumber_;
Austin Schuh	81da4b2	2019-10-06 14:03:24 -0700	[diff] [blame]	19	} else if (Consume("/*")) {
				20	while (!Consume("*/")) {
				21	if (Char() == '\n') {
				22	++linenumber_;
				23	}
				24	ConsumeChar();
				25	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	26	} else {
				27	// There is no fail. Once we are out of whitespace (including 0 of it),
				28	// declare success.
				29	return;
				30	}
				31	}
				32	}
				33
				34	bool Tokenizer::Consume(const char *token) {
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	35	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	36	while (true) {
				37	// Finishing the token is success.
				38	if (*token == '\0') {
				39	return true;
				40	}
				41
				42	// But finishing the data first is failure.
				43	if (AtEnd()) {
				44	data_ = original;
				45	return false;
				46	}
				47
				48	// Missmatch is failure.
				49	if (*token != Char()) {
				50	data_ = original;
				51	return false;
				52	}
				53
				54	ConsumeChar();
				55	++token;
				56	}
				57	}
				58
				59	bool Tokenizer::ConsumeString(::std::string *s) {
				60	// Under no conditions is it acceptible to run out of data while parsing a
				61	// string. Any AtEnd checks should confirm that.
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	62	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	63	if (AtEnd()) {
				64	return false;
				65	}
				66
				67	// Expect the leading "
				68	if (Char() != '"') {
				69	return false;
				70	}
				71
				72	ConsumeChar();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	73	std::string_view last_parsed_data = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	74	*s = ::std::string();
				75
				76	while (true) {
				77	if (AtEnd()) {
				78	data_ = original;
				79	return false;
				80	}
				81
				82	// If we get an end or an escape, do something special.
				83	if (Char() == '"' \|\| Char() == '\\') {
				84	// Save what we found up until now, not including this character.
				85	*s += ::std::string(
				86	last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
				87
				88	// Update the pointer.
				89	last_parsed_data = data_;
				90
				91	// " is the end, declare victory.
				92	if (Char() == '"') {
				93	ConsumeChar();
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame^]	94	if (unicode_high_surrogate_ != -1) {
				95	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				96	data_ = original;
				97	return false;
				98	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	99	return true;
				100	} else {
				101	ConsumeChar();
				102	// Now consume valid escape characters and add their representation onto
				103	// the output string.
				104	if (AtEnd()) {
				105	data_ = original;
				106	return false;
				107	} else if (Char() == '"') {
				108	*s += "\"";
				109	} else if (Char() == '\\') {
				110	*s += "\\";
				111	} else if (Char() == '/') {
				112	*s += "/";
				113	} else if (Char() == 'b') {
				114	*s += "\b";
				115	} else if (Char() == 'f') {
				116	*s += "\f";
				117	} else if (Char() == 'n') {
				118	*s += "\n";
				119	} else if (Char() == 'r') {
				120	*s += "\r";
				121	} else if (Char() == 't') {
				122	*s += "\t";
				123	} else if (Char() == 'u') {
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame^]	124	if (!ConsumeUnicode(s)) {
				125	fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
				126	data_ = original;
				127	return false;
				128	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	129	}
				130	}
				131	// And skip the escaped character.
				132	last_parsed_data = data_.substr(1);
				133	}
				134
				135	ConsumeChar();
				136	}
				137	}
				138
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame^]	139	bool Tokenizer::ConsumeUnicode(::std::string *s) {
				140	// Under no conditions is it acceptible to run out of data while parsing a
				141	// unicode. Any AtEnd checks should confirm that.
				142	uint32_t val;
				143
				144	// Consume unicode representation
				145	ConsumeChar();
				146
				147	char target[5];
				148
				149	// Valid unicode is 4 hex digits so evaluate the next 4 characters
				150	for (int count = 0; count < 4; count++) {
				151	// If there is no data or data is an invalid char, return false
				152	if (AtEnd()) {
				153	return false;
				154	}
				155
				156	if (!isxdigit(Char())) {
				157	return false;
				158	}
				159
				160	target[count] = Char();
				161
				162	// Do not consume the last character
				163	if (count == 3) {
				164	break;
				165	}
				166
				167	ConsumeChar();
				168	}
				169	target[4] = '\0';
				170
				171	// References: flatbuffers/src/idl_parser.cpp
				172	val = flatbuffers::StringToUInt(target, 16);
				173
				174	if (val >= 0xD800 && val <= 0xDBFF) {
				175	if (unicode_high_surrogate_ != -1) {
				176	fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
				177	return false;
				178	} else {
				179	unicode_high_surrogate_ = static_cast<int>(val);
				180	}
				181	} else if (val >= 0xDC00 && val <= 0xDFFF) {
				182	if (unicode_high_surrogate_ == -1) {
				183	fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
				184	return false;
				185	} else {
				186	int code_point =
				187	0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
				188	flatbuffers::ToUTF8(code_point, s);
				189	unicode_high_surrogate_ = -1;
				190	}
				191	} else {
				192	if (unicode_high_surrogate_ != -1) {
				193	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				194	return false;
				195	}
				196	flatbuffers::ToUTF8(static_cast<int>(val), s);
				197	}
				198	return true;
				199	}
				200
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	201	bool Tokenizer::ConsumeNumber(::std::string *s) {
				202	// Under no conditions is it acceptible to run out of data while parsing a
				203	// number. Any AtEnd() checks should confirm that.
				204	*s = ::std::string();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	205	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	206
				207	// Consume the leading - unconditionally.
				208	Consume("-");
				209
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	210	// See if we find nan. This isn't standards compliant, but is what
				211	// flatbuffers prints out, so we need to parse it.
				212	if (Consume("nan")) {
				213	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				214	return true;
				215	}
				216
Brian Silverman	714b1d6	2020-04-28 16:52:54 -0700	[diff] [blame]	217	// People tend to use null instead of nan. Accept that too.
				218	if (Consume("null")) {
				219	*s = ::std::string("nan");
				220	return true;
				221	}
				222
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	223	// Inf is also acceptable.
				224	if (Consume("inf")) {
				225	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				226	return true;
				227	}
				228
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	229	// Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
				230	// by a second number.
				231	if (!Consume("0")) {
				232	if (AtEnd()) {
				233	return false;
				234	} else if (Char() >= '1' && Char() <= '9') {
				235	// This wasn't a zero, but was a valid digit. Consume it.
				236	ConsumeChar();
				237	} else {
				238	return false;
				239	}
				240
				241	// Now consume any number of any digits.
				242	while (true) {
				243	if (AtEnd()) {
				244	data_ = original;
				245	return false;
				246	}
				247	if (Char() < '0' \|\| Char() > '9') {
				248	break;
				249	}
				250	ConsumeChar();
				251	}
				252	}
				253
				254	// We could now have a decimal.
				255	if (Char() == '.') {
				256	ConsumeChar();
				257	while (true) {
				258	if (AtEnd()) {
				259	data_ = original;
				260	return false;
				261	}
				262	// And any number of digits.
				263	if (Char() < '0' \|\| Char() > '9') {
				264	break;
				265	}
				266	ConsumeChar();
				267	}
				268	}
				269
				270	// And now an exponent.
				271	if (Char() == 'e' \|\| Char() == 'E') {
				272	ConsumeChar();
				273	if (AtEnd()) {
				274	data_ = original;
				275	return false;
				276	}
				277
				278	// Which could have a +-
				279	if (Char() == '+' \|\| Char() == '-') {
				280	ConsumeChar();
				281	}
				282	int count = 0;
				283	while (true) {
				284	if (AtEnd()) {
				285	data_ = original;
				286	return false;
				287	}
				288	// And digits.
				289	if (Char() < '0' \|\| Char() > '9') {
				290	break;
				291	}
				292	ConsumeChar();
				293	++count;
				294	}
				295	// But, it is an error to have an exponent and nothing following it.
				296	if (count == 0) {
				297	data_ = original;
				298	return false;
				299	}
				300	}
				301
				302	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				303	return true;
				304	}
				305
				306	Tokenizer::TokenType Tokenizer::Next() {
				307	switch (state_) {
				308	case State::kExpectObjectStart:
				309	// We should always start out with a {
Austin Schuh	6f89670	2020-03-19 16:07:20 -0700	[diff] [blame]	310	if (!Consume("{")) {
				311	fprintf(stderr, "Error on line %d, expected { for start.\n",
				312	linenumber_);
				313	return TokenType::kError;
				314	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	315
				316	// Document that we just started an object.
				317	object_type_.push_back(ObjectType::kObject);
				318
				319	ConsumeWhitespace();
				320
				321	if (Consume("}")) {
				322	ConsumeWhitespace();
				323	state_ = State::kExpectObjectEnd;
				324	} else {
				325	state_ = State::kExpectField;
				326	}
				327	return TokenType::kStartObject;
				328
				329	case State::kExpectField: {
				330	// Fields are built up of strings, whitespace, and then a : (followed by
				331	// whitespace...)
				332	::std::string s;
				333	if (!ConsumeString(&s)) {
				334	fprintf(stderr, "Error on line %d, expected string for field name.\n",
				335	linenumber_);
Austin Schuh	9270084	2019-12-28 13:00:17 -0800	[diff] [blame]	336	if (Consume("}")) {
				337	fprintf(stderr,
				338	"Got '}' instead. Did you add an extra trailing ','?\n");
				339	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	340	return TokenType::kError;
				341	}
				342	field_name_ = ::std::move(s);
				343
				344	ConsumeWhitespace();
				345
				346	if (!Consume(":")) {
Austin Schuh	2595a14	2020-11-29 22:43:57 -0800	[diff] [blame]	347	fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
				348	linenumber_, Char());
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	349	return TokenType::kError;
				350	}
				351
				352	ConsumeWhitespace();
				353
				354	state_ = State::kExpectValue;
				355
				356	return TokenType::kField;
				357	} break;
				358	case State::kExpectValue: {
				359	TokenType result = TokenType::kError;
				360
				361	::std::string s;
				362	if (Consume("{")) {
				363	// Fields are in objects. Record and recurse.
				364	object_type_.push_back(ObjectType::kObject);
				365
				366	ConsumeWhitespace();
				367
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	368	// And then if we encounter the end again, go to the end state.
				369	if (Consume("}")) {
				370	ConsumeWhitespace();
				371	state_ = State::kExpectObjectEnd;
				372	} else {
				373	state_ = State::kExpectField;
				374	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	375	return TokenType::kStartObject;
				376	} else if (Consume("[")) {
				377	// Values are in arrays. Record and recurse.
				378	object_type_.push_back(ObjectType::kArray);
				379
				380	ConsumeWhitespace();
				381	state_ = State::kExpectValue;
				382	return TokenType::kStartArray;
				383	} else if (ConsumeString(&s)) {
				384	// Parsed as a string, grab it.
				385	field_value_ = ::std::move(s);
				386	result = TokenType::kStringValue;
				387	} else if (ConsumeNumber(&s)) {
				388	// Parsed as a number, grab it.
				389	field_value_ = ::std::move(s);
				390	result = TokenType::kNumberValue;
				391	} else if (Consume("true")) {
				392	// Parsed as a true, grab it.
				393	field_value_ = "true";
				394	result = TokenType::kTrueValue;
				395	} else if (Consume("false")) {
				396	// Parsed as a false, grab it.
				397	field_value_ = "false";
				398	result = TokenType::kFalseValue;
				399	} else {
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	400	switch (object_type_.back()) {
				401	case ObjectType::kObject:
				402	if (Consume("}")) {
				403	ConsumeWhitespace();
				404	state_ = State::kExpectObjectEnd;
				405	return Next();
				406	}
				407	break;
				408	case ObjectType::kArray:
				409	if (Consume("]")) {
				410	ConsumeWhitespace();
				411	state_ = State::kExpectArrayEnd;
				412	return Next();
				413	}
				414	break;
				415	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	416	// Couldn't parse, so we have a syntax error.
				417	fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
				418	}
				419
				420	ConsumeWhitespace();
				421
				422	// After a field, we either have a , and another field (or value if we are
				423	// in an array), or we should be closing out the object (or array).
				424	if (Consume(",")) {
				425	ConsumeWhitespace();
				426	switch (object_type_.back()) {
				427	case ObjectType::kObject:
				428	state_ = State::kExpectField;
				429	break;
				430	case ObjectType::kArray:
				431	state_ = State::kExpectValue;
				432	break;
				433	}
				434	} else {
				435	// Sanity check that the stack is deep enough.
				436	if (object_type_.size() == 0) {
				437	fprintf(stderr, "Error on line %d\n", linenumber_);
				438	return TokenType::kError;
				439	}
				440
				441	// And then require closing out the object.
				442	switch (object_type_.back()) {
				443	case ObjectType::kObject:
				444	if (Consume("}")) {
				445	ConsumeWhitespace();
				446	state_ = State::kExpectObjectEnd;
				447	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	448	fprintf(stderr, "Error on line %d, expected } or ,\n",
				449	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	450	return TokenType::kError;
				451	}
				452	break;
				453	case ObjectType::kArray:
				454	if (Consume("]")) {
				455	ConsumeWhitespace();
				456	state_ = State::kExpectArrayEnd;
				457	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	458	fprintf(stderr, "Error on line %d, expected ] or ,\n",
				459	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	460	return TokenType::kError;
				461	}
				462	break;
				463	}
				464	}
				465	return result;
				466	} break;
				467
				468	case State::kExpectArrayEnd:
				469	case State::kExpectObjectEnd: {
				470	const TokenType result = state_ == State::kExpectArrayEnd
				471	? TokenType::kEndArray
				472	: TokenType::kEndObject;
				473	// This is a transient state so we can send 2 tokens out in a row. We
				474	// discover the object or array end at the end of reading the value.
				475	object_type_.pop_back();
				476	if (object_type_.size() == 0) {
				477	// We unwound the outer object. We should send kEnd next.
				478	state_ = State::kExpectEnd;
				479	} else if (object_type_.back() == ObjectType::kObject) {
				480	// If we are going into an object, it should either have another field
				481	// or end.
				482	if (Consume(",")) {
				483	ConsumeWhitespace();
				484	state_ = State::kExpectField;
				485	} else if (Consume("}")) {
				486	ConsumeWhitespace();
				487	state_ = State::kExpectObjectEnd;
				488	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	489	fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	490	return TokenType::kError;
				491	}
				492	} else if (object_type_.back() == ObjectType::kArray) {
				493	// If we are going into an array, it should either have another value
				494	// or end.
				495	if (Consume(",")) {
				496	ConsumeWhitespace();
				497	state_ = State::kExpectValue;
				498	} else if (Consume("]")) {
				499	ConsumeWhitespace();
				500	state_ = State::kExpectArrayEnd;
				501	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	502	fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	503	return TokenType::kError;
				504	}
				505	}
				506	// And then send out the correct token.
				507	return result;
				508	}
				509	case State::kExpectEnd:
				510	// If we are supposed to be done, confirm nothing is after the end.
				511	if (AtEnd()) {
				512	return TokenType::kEnd;
				513	} else {
				514	fprintf(stderr, "Data past end at line %d\n", linenumber_);
				515	return TokenType::kError;
				516	}
				517	}
				518	return TokenType::kError;
				519	}
				520
				521	bool Tokenizer::FieldAsInt(long long *value) {
				522	const char *pos = field_value().c_str();
				523	errno = 0;
				524	value = strtoll(field_value().c_str(), const_cast<char *>(&pos), 10);
				525	if (pos != field_value().c_str() + field_value().size() \|\| errno != 0) {
				526	return false;
				527	}
				528	return true;
				529	}
				530
				531	bool Tokenizer::FieldAsDouble(double *value) {
				532	const char *pos = field_value().c_str();
				533	errno = 0;
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	534	if (field_value() == "nan") {
				535	*value = std::numeric_limits<double>::quiet_NaN();
				536	return true;
				537	} else if (field_value() == "-nan") {
				538	*value = -std::numeric_limits<double>::quiet_NaN();
				539	return true;
				540	}
				541
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	542	if (field_value() == "inf") {
				543	*value = std::numeric_limits<double>::infinity();
				544	return true;
				545	} else if (field_value() == "-inf") {
				546	*value = -std::numeric_limits<double>::infinity();
				547	return true;
				548	}
				549
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	550	value = strtod(field_value().c_str(), const_cast<char *>(&pos));
				551
				552	if (pos != field_value().c_str() + field_value().size() \|\| errno != 0) {
				553	return false;
				554	}
				555	return true;
				556	}
				557
				558	} // namespace aos