Blame - aos/json_tokenizer.cc - RealtimeRoboticsGroup/test

blob: 32c92475278196e9f1419ac3f88bf0796ee40693 [file] [log] [blame]

Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	1	#include "aos/json_tokenizer.h"
				2
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	3	#include <cerrno>
Brian Silverman	4c7235a	2021-11-17 19:04:37 -0800	[diff] [blame]	4	#include <limits>
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	5
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	6	namespace aos {
				7
				8	void Tokenizer::ConsumeWhitespace() {
				9	while (true) {
				10	if (AtEnd()) {
				11	return;
				12	}
				13	// Skip any whitespace.
				14	if (Char() == ' ' \|\| Char() == '\r' \|\| Char() == '\t') {
				15	ConsumeChar();
				16	} else if (Char() == '\n') {
				17	ConsumeChar();
				18	++linenumber_;
Austin Schuh	81da4b2	2019-10-06 14:03:24 -0700	[diff] [blame]	19	} else if (Consume("/*")) {
				20	while (!Consume("*/")) {
				21	if (Char() == '\n') {
				22	++linenumber_;
				23	}
				24	ConsumeChar();
				25	}
Stephan Pleines	8983685	2023-09-15 20:11:57 -0700	[diff] [blame]	26	} else if (Consume("//")) {
				27	// C++ style comment. Keep consuming chars until newline, or until the
				28	// end of the file if this is the last line (no newline at end of file).
				29	while (true) {
Brian J Griglak	2e16e7b	2024-03-01 12:10:46 -0700	[diff] [blame]	30	// First check if we are at the end of the file.
Stephan Pleines	8983685	2023-09-15 20:11:57 -0700	[diff] [blame]	31	if (AtEnd()) {
				32	return;
				33	}
Brian J Griglak	2e16e7b	2024-03-01 12:10:46 -0700	[diff] [blame]	34	// Then check if we are at the end of the line.
Stephan Pleines	8983685	2023-09-15 20:11:57 -0700	[diff] [blame]	35	if (Char() == '\n') {
				36	++linenumber_;
				37	break;
				38	}
Brian J Griglak	2e16e7b	2024-03-01 12:10:46 -0700	[diff] [blame]	39	// Advance to next character and repeat.
				40	ConsumeChar();
Stephan Pleines	8983685	2023-09-15 20:11:57 -0700	[diff] [blame]	41	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	42	} else {
				43	// There is no fail. Once we are out of whitespace (including 0 of it),
				44	// declare success.
				45	return;
				46	}
				47	}
				48	}
				49
				50	bool Tokenizer::Consume(const char *token) {
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	51	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	52	while (true) {
				53	// Finishing the token is success.
				54	if (*token == '\0') {
				55	return true;
				56	}
				57
				58	// But finishing the data first is failure.
				59	if (AtEnd()) {
				60	data_ = original;
				61	return false;
				62	}
				63
				64	// Missmatch is failure.
				65	if (*token != Char()) {
				66	data_ = original;
				67	return false;
				68	}
				69
				70	ConsumeChar();
				71	++token;
				72	}
				73	}
				74
				75	bool Tokenizer::ConsumeString(::std::string *s) {
				76	// Under no conditions is it acceptible to run out of data while parsing a
				77	// string. Any AtEnd checks should confirm that.
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	78	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	79	if (AtEnd()) {
				80	return false;
				81	}
				82
				83	// Expect the leading "
				84	if (Char() != '"') {
				85	return false;
				86	}
				87
				88	ConsumeChar();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	89	std::string_view last_parsed_data = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	90	*s = ::std::string();
				91
				92	while (true) {
				93	if (AtEnd()) {
				94	data_ = original;
				95	return false;
				96	}
				97
				98	// If we get an end or an escape, do something special.
				99	if (Char() == '"' \|\| Char() == '\\') {
				100	// Save what we found up until now, not including this character.
				101	*s += ::std::string(
				102	last_parsed_data.substr(0, last_parsed_data.size() - data_.size()));
				103
				104	// Update the pointer.
				105	last_parsed_data = data_;
				106
				107	// " is the end, declare victory.
				108	if (Char() == '"') {
				109	ConsumeChar();
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	110	if (unicode_high_surrogate_ != -1) {
				111	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				112	data_ = original;
				113	return false;
				114	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	115	return true;
				116	} else {
				117	ConsumeChar();
				118	// Now consume valid escape characters and add their representation onto
				119	// the output string.
				120	if (AtEnd()) {
				121	data_ = original;
				122	return false;
				123	} else if (Char() == '"') {
				124	*s += "\"";
				125	} else if (Char() == '\\') {
				126	*s += "\\";
				127	} else if (Char() == '/') {
				128	*s += "/";
				129	} else if (Char() == 'b') {
				130	*s += "\b";
				131	} else if (Char() == 'f') {
				132	*s += "\f";
				133	} else if (Char() == 'n') {
				134	*s += "\n";
				135	} else if (Char() == 'r') {
				136	*s += "\r";
				137	} else if (Char() == 't') {
				138	*s += "\t";
				139	} else if (Char() == 'u') {
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	140	if (!ConsumeUnicode(s)) {
				141	fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
				142	data_ = original;
				143	return false;
				144	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	145	}
				146	}
				147	// And skip the escaped character.
				148	last_parsed_data = data_.substr(1);
				149	}
				150
				151	ConsumeChar();
				152	}
				153	}
				154
Pallavi Madhukar	e2eb281	2022-07-19 09:56:09 -0700	[diff] [blame]	155	bool Tokenizer::ConsumeUnicode(::std::string *s) {
				156	// Under no conditions is it acceptible to run out of data while parsing a
				157	// unicode. Any AtEnd checks should confirm that.
				158	uint32_t val;
				159
				160	// Consume unicode representation
				161	ConsumeChar();
				162
				163	char target[5];
				164
				165	// Valid unicode is 4 hex digits so evaluate the next 4 characters
				166	for (int count = 0; count < 4; count++) {
				167	// If there is no data or data is an invalid char, return false
				168	if (AtEnd()) {
				169	return false;
				170	}
				171
				172	if (!isxdigit(Char())) {
				173	return false;
				174	}
				175
				176	target[count] = Char();
				177
				178	// Do not consume the last character
				179	if (count == 3) {
				180	break;
				181	}
				182
				183	ConsumeChar();
				184	}
				185	target[4] = '\0';
				186
				187	// References: flatbuffers/src/idl_parser.cpp
				188	val = flatbuffers::StringToUInt(target, 16);
				189
				190	if (val >= 0xD800 && val <= 0xDBFF) {
				191	if (unicode_high_surrogate_ != -1) {
				192	fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
				193	return false;
				194	} else {
				195	unicode_high_surrogate_ = static_cast<int>(val);
				196	}
				197	} else if (val >= 0xDC00 && val <= 0xDFFF) {
				198	if (unicode_high_surrogate_ == -1) {
				199	fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
				200	return false;
				201	} else {
				202	int code_point =
				203	0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
				204	flatbuffers::ToUTF8(code_point, s);
				205	unicode_high_surrogate_ = -1;
				206	}
				207	} else {
				208	if (unicode_high_surrogate_ != -1) {
				209	fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
				210	return false;
				211	}
				212	flatbuffers::ToUTF8(static_cast<int>(val), s);
				213	}
				214	return true;
				215	}
				216
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	217	bool Tokenizer::ConsumeNumber(::std::string *s) {
				218	// Under no conditions is it acceptible to run out of data while parsing a
				219	// number. Any AtEnd() checks should confirm that.
				220	*s = ::std::string();
James Kuszmaul	3ae4226	2019-11-08 12:33:41 -0800	[diff] [blame]	221	const std::string_view original = data_;
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	222
				223	// Consume the leading - unconditionally.
				224	Consume("-");
				225
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	226	// See if we find nan. This isn't standards compliant, but is what
				227	// flatbuffers prints out, so we need to parse it.
				228	if (Consume("nan")) {
				229	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				230	return true;
				231	}
				232
Brian Silverman	714b1d6	2020-04-28 16:52:54 -0700	[diff] [blame]	233	// People tend to use null instead of nan. Accept that too.
				234	if (Consume("null")) {
				235	*s = ::std::string("nan");
				236	return true;
				237	}
				238
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	239	// Inf is also acceptable.
				240	if (Consume("inf")) {
				241	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				242	return true;
				243	}
				244
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	245	// Then, we either get a 0, or we get a nonzero. Only nonzero can be followed
				246	// by a second number.
				247	if (!Consume("0")) {
				248	if (AtEnd()) {
				249	return false;
				250	} else if (Char() >= '1' && Char() <= '9') {
				251	// This wasn't a zero, but was a valid digit. Consume it.
				252	ConsumeChar();
				253	} else {
				254	return false;
				255	}
				256
				257	// Now consume any number of any digits.
				258	while (true) {
				259	if (AtEnd()) {
				260	data_ = original;
				261	return false;
				262	}
				263	if (Char() < '0' \|\| Char() > '9') {
				264	break;
				265	}
				266	ConsumeChar();
				267	}
				268	}
				269
				270	// We could now have a decimal.
				271	if (Char() == '.') {
				272	ConsumeChar();
				273	while (true) {
				274	if (AtEnd()) {
				275	data_ = original;
				276	return false;
				277	}
				278	// And any number of digits.
				279	if (Char() < '0' \|\| Char() > '9') {
				280	break;
				281	}
				282	ConsumeChar();
				283	}
				284	}
				285
				286	// And now an exponent.
				287	if (Char() == 'e' \|\| Char() == 'E') {
				288	ConsumeChar();
				289	if (AtEnd()) {
				290	data_ = original;
				291	return false;
				292	}
				293
				294	// Which could have a +-
				295	if (Char() == '+' \|\| Char() == '-') {
				296	ConsumeChar();
				297	}
				298	int count = 0;
				299	while (true) {
				300	if (AtEnd()) {
				301	data_ = original;
				302	return false;
				303	}
				304	// And digits.
				305	if (Char() < '0' \|\| Char() > '9') {
				306	break;
				307	}
				308	ConsumeChar();
				309	++count;
				310	}
				311	// But, it is an error to have an exponent and nothing following it.
				312	if (count == 0) {
				313	data_ = original;
				314	return false;
				315	}
				316	}
				317
				318	*s = ::std::string(original.substr(0, original.size() - data_.size()));
				319	return true;
				320	}
				321
				322	Tokenizer::TokenType Tokenizer::Next() {
				323	switch (state_) {
				324	case State::kExpectObjectStart:
				325	// We should always start out with a {
Austin Schuh	6f89670	2020-03-19 16:07:20 -0700	[diff] [blame]	326	if (!Consume("{")) {
				327	fprintf(stderr, "Error on line %d, expected { for start.\n",
				328	linenumber_);
				329	return TokenType::kError;
				330	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	331
				332	// Document that we just started an object.
				333	object_type_.push_back(ObjectType::kObject);
				334
				335	ConsumeWhitespace();
				336
				337	if (Consume("}")) {
				338	ConsumeWhitespace();
				339	state_ = State::kExpectObjectEnd;
				340	} else {
				341	state_ = State::kExpectField;
				342	}
				343	return TokenType::kStartObject;
				344
				345	case State::kExpectField: {
				346	// Fields are built up of strings, whitespace, and then a : (followed by
				347	// whitespace...)
				348	::std::string s;
				349	if (!ConsumeString(&s)) {
				350	fprintf(stderr, "Error on line %d, expected string for field name.\n",
				351	linenumber_);
Austin Schuh	9270084	2019-12-28 13:00:17 -0800	[diff] [blame]	352	if (Consume("}")) {
				353	fprintf(stderr,
				354	"Got '}' instead. Did you add an extra trailing ','?\n");
				355	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	356	return TokenType::kError;
				357	}
				358	field_name_ = ::std::move(s);
				359
				360	ConsumeWhitespace();
				361
				362	if (!Consume(":")) {
Austin Schuh	2595a14	2020-11-29 22:43:57 -0800	[diff] [blame]	363	fprintf(stderr, "Error on line %d, expected ':', got '%c'\n",
				364	linenumber_, Char());
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	365	return TokenType::kError;
				366	}
				367
				368	ConsumeWhitespace();
				369
				370	state_ = State::kExpectValue;
				371
				372	return TokenType::kField;
				373	} break;
				374	case State::kExpectValue: {
				375	TokenType result = TokenType::kError;
				376
				377	::std::string s;
				378	if (Consume("{")) {
				379	// Fields are in objects. Record and recurse.
				380	object_type_.push_back(ObjectType::kObject);
				381
				382	ConsumeWhitespace();
				383
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	384	// And then if we encounter the end again, go to the end state.
				385	if (Consume("}")) {
				386	ConsumeWhitespace();
				387	state_ = State::kExpectObjectEnd;
				388	} else {
				389	state_ = State::kExpectField;
				390	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	391	return TokenType::kStartObject;
				392	} else if (Consume("[")) {
				393	// Values are in arrays. Record and recurse.
				394	object_type_.push_back(ObjectType::kArray);
				395
				396	ConsumeWhitespace();
				397	state_ = State::kExpectValue;
				398	return TokenType::kStartArray;
				399	} else if (ConsumeString(&s)) {
				400	// Parsed as a string, grab it.
				401	field_value_ = ::std::move(s);
				402	result = TokenType::kStringValue;
				403	} else if (ConsumeNumber(&s)) {
				404	// Parsed as a number, grab it.
				405	field_value_ = ::std::move(s);
				406	result = TokenType::kNumberValue;
				407	} else if (Consume("true")) {
				408	// Parsed as a true, grab it.
				409	field_value_ = "true";
				410	result = TokenType::kTrueValue;
				411	} else if (Consume("false")) {
				412	// Parsed as a false, grab it.
				413	field_value_ = "false";
				414	result = TokenType::kFalseValue;
				415	} else {
Alex Perry	cb7da4b	2019-08-28 19:35:56 -0700	[diff] [blame]	416	switch (object_type_.back()) {
				417	case ObjectType::kObject:
				418	if (Consume("}")) {
				419	ConsumeWhitespace();
				420	state_ = State::kExpectObjectEnd;
				421	return Next();
				422	}
				423	break;
				424	case ObjectType::kArray:
				425	if (Consume("]")) {
				426	ConsumeWhitespace();
				427	state_ = State::kExpectArrayEnd;
				428	return Next();
				429	}
				430	break;
				431	}
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	432	// Couldn't parse, so we have a syntax error.
				433	fprintf(stderr, "Error line %d, invalid field value.\n", linenumber_);
				434	}
				435
				436	ConsumeWhitespace();
				437
				438	// After a field, we either have a , and another field (or value if we are
				439	// in an array), or we should be closing out the object (or array).
				440	if (Consume(",")) {
				441	ConsumeWhitespace();
				442	switch (object_type_.back()) {
				443	case ObjectType::kObject:
				444	state_ = State::kExpectField;
				445	break;
				446	case ObjectType::kArray:
				447	state_ = State::kExpectValue;
				448	break;
				449	}
				450	} else {
				451	// Sanity check that the stack is deep enough.
				452	if (object_type_.size() == 0) {
				453	fprintf(stderr, "Error on line %d\n", linenumber_);
				454	return TokenType::kError;
				455	}
				456
				457	// And then require closing out the object.
				458	switch (object_type_.back()) {
				459	case ObjectType::kObject:
				460	if (Consume("}")) {
				461	ConsumeWhitespace();
				462	state_ = State::kExpectObjectEnd;
				463	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	464	fprintf(stderr, "Error on line %d, expected } or ,\n",
				465	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	466	return TokenType::kError;
				467	}
				468	break;
				469	case ObjectType::kArray:
				470	if (Consume("]")) {
				471	ConsumeWhitespace();
				472	state_ = State::kExpectArrayEnd;
				473	} else {
Austin Schuh	60e7794	2022-05-16 17:48:24 -0700	[diff] [blame]	474	fprintf(stderr, "Error on line %d, expected ] or ,\n",
				475	linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	476	return TokenType::kError;
				477	}
				478	break;
				479	}
				480	}
				481	return result;
				482	} break;
				483
				484	case State::kExpectArrayEnd:
				485	case State::kExpectObjectEnd: {
				486	const TokenType result = state_ == State::kExpectArrayEnd
				487	? TokenType::kEndArray
				488	: TokenType::kEndObject;
				489	// This is a transient state so we can send 2 tokens out in a row. We
				490	// discover the object or array end at the end of reading the value.
				491	object_type_.pop_back();
				492	if (object_type_.size() == 0) {
				493	// We unwound the outer object. We should send kEnd next.
				494	state_ = State::kExpectEnd;
				495	} else if (object_type_.back() == ObjectType::kObject) {
				496	// If we are going into an object, it should either have another field
				497	// or end.
				498	if (Consume(",")) {
				499	ConsumeWhitespace();
				500	state_ = State::kExpectField;
				501	} else if (Consume("}")) {
				502	ConsumeWhitespace();
				503	state_ = State::kExpectObjectEnd;
				504	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	505	fprintf(stderr, "Error on line %d, expected , or }\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	506	return TokenType::kError;
				507	}
				508	} else if (object_type_.back() == ObjectType::kArray) {
				509	// If we are going into an array, it should either have another value
				510	// or end.
				511	if (Consume(",")) {
				512	ConsumeWhitespace();
				513	state_ = State::kExpectValue;
				514	} else if (Consume("]")) {
				515	ConsumeWhitespace();
				516	state_ = State::kExpectArrayEnd;
				517	} else {
Austin Schuh	217a978	2019-12-21 23:02:50 -0800	[diff] [blame]	518	fprintf(stderr, "Error on line %d, expected , or ]\n", linenumber_);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	519	return TokenType::kError;
				520	}
				521	}
				522	// And then send out the correct token.
				523	return result;
				524	}
				525	case State::kExpectEnd:
				526	// If we are supposed to be done, confirm nothing is after the end.
				527	if (AtEnd()) {
				528	return TokenType::kEnd;
				529	} else {
				530	fprintf(stderr, "Data past end at line %d\n", linenumber_);
				531	return TokenType::kError;
				532	}
				533	}
				534	return TokenType::kError;
				535	}
				536
James Kuszmaul	768c468	2023-10-12 21:07:16 -0700	[diff] [blame]	537	bool Tokenizer::FieldAsInt(absl::int128 *value) {
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	538	const char *pos = field_value().c_str();
James Kuszmaul	768c468	2023-10-12 21:07:16 -0700	[diff] [blame]	539	return absl::SimpleAtoi(pos, value);
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	540	}
				541
				542	bool Tokenizer::FieldAsDouble(double *value) {
				543	const char *pos = field_value().c_str();
				544	errno = 0;
Austin Schuh	bba0c3c	2019-11-29 22:00:34 -0800	[diff] [blame]	545	if (field_value() == "nan") {
				546	*value = std::numeric_limits<double>::quiet_NaN();
				547	return true;
				548	} else if (field_value() == "-nan") {
				549	*value = -std::numeric_limits<double>::quiet_NaN();
				550	return true;
				551	}
				552
Austin Schuh	9fa0b8e	2021-03-21 19:21:50 -0700	[diff] [blame]	553	if (field_value() == "inf") {
				554	*value = std::numeric_limits<double>::infinity();
				555	return true;
				556	} else if (field_value() == "-inf") {
				557	*value = -std::numeric_limits<double>::infinity();
				558	return true;
				559	}
				560
Austin Schuh	d7e252d	2019-10-06 13:51:02 -0700	[diff] [blame]	561	value = strtod(field_value().c_str(), const_cast<char *>(&pos));
				562
				563	if (pos != field_value().c_str() + field_value().size() \|\| errno != 0) {
				564	return false;
				565	}
				566	return true;
				567	}
				568
				569	} // namespace aos