// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton): This is copied from coded_stream_unittest. This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
//
//   int kCases[] = {1, 2, 3, 4};
//   TEST_1D(MyFixture, MyTest, kCases) {
//     EXPECT_GT(kCases_case, 0);
//   }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
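//
// TEST_2D works the same way but takes two case arrays and runs every
// combination of a case from the first with a case from the second. An
// illustrative sketch (the fixture and arrays named here are hypothetical,
// not part of this file):
//
//   int kSizes[] = {1, 2, 3};
//   const char* kNames[] = {"foo", "bar"};
//   TEST_2D(MyFixture, MySizeNameTest, kSizes, kNames) {
//     // Both kSizes_case and kNames_case are available in the test body.
//     EXPECT_GT(kSizes_case, 0);
//   }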

#define TEST_1D(FIXTURE, NAME, CASES) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType> \
    void DoSingleCase(const CaseType& CASES##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) { \
      SCOPED_TRACE(testing::Message() \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]); \
    } \
  } \
  \
  template <typename CaseType> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
  class FIXTURE##_##NAME##_DD : public FIXTURE { \
   protected: \
    template <typename CaseType1, typename CaseType2> \
    void DoSingleCase(const CaseType1& CASES1##_case, \
                      const CaseType2& CASES2##_case); \
  }; \
  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) { \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) { \
        SCOPED_TRACE(testing::Message() \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]); \
        DoSingleCase(CASES1[i], CASES2[j]); \
      } \
    } \
  } \
  \
  template <typename CaseType1, typename CaseType2> \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
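// (Returning true with a zero-size buffer is permitted by the
// ZeroCopyInputStream contract as long as later calls eventually produce
// data, so the tokenizer has to cope with it.)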
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer whenever the call count (starting at zero) is
    // a multiple of 3 or 5; otherwise defer to the underlying array stream.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello", Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123", Tokenizer::TYPE_INTEGER },
  { "0xab6", Tokenizer::TYPE_INTEGER },
  { "0XAB6", Tokenizer::TYPE_INTEGER },
  { "0X1234567", Tokenizer::TYPE_INTEGER },
  { "0x89abcdef", Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF", Tokenizer::TYPE_INTEGER },
  { "01234567", Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45", Tokenizer::TYPE_FLOAT },
  { "1.", Tokenizer::TYPE_FLOAT },
  { "1e3", Tokenizer::TYPE_FLOAT },
  { "1E3", Tokenizer::TYPE_FLOAT },
  { "1e-3", Tokenizer::TYPE_FLOAT },
  { "1e+3", Tokenizer::TYPE_FLOAT },
  { "1.e3", Tokenizer::TYPE_FLOAT },
  { "1.2e3", Tokenizer::TYPE_FLOAT },
  { ".1", Tokenizer::TYPE_FLOAT },
  { ".1e3", Tokenizer::TYPE_FLOAT },
  { ".1e-3", Tokenizer::TYPE_FLOAT },
  { ".1e+3", Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'", Tokenizer::TYPE_STRING },
  { "\"foo\"", Tokenizer::TYPE_STRING },
  { "'a\"b'", Tokenizer::TYPE_STRING },
  { "\"a'b\"", Tokenizer::TYPE_STRING },
  { "'a\\'b'", Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"", Tokenizer::TYPE_STRING },
  { "'\\xf'", Tokenizer::TYPE_STRING },
  { "'\\0'", Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+", Tokenizer::TYPE_SYMBOL },
  { ".", Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work. There
                                // is no reason this can't be increased if
                                // needed.
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

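// Each Token below is written as an aggregate initializer; the values are
// the Token fields in order: type, text, line, column, end_column.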
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END,        "",      0,  0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_INTEGER,    "1",     0,  4,  5 },
    { Tokenizer::TYPE_FLOAT,      "1.2",   0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL,     "+",     0, 10, 11 },
    { Tokenizer::TYPE_STRING,     "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END,        "",      0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL,     "!",     0,  0,  1 },
    { Tokenizer::TYPE_SYMBOL,     "@",     0,  1,  2 },
    { Tokenizer::TYPE_SYMBOL,     "+",     0,  2,  3 },
    { Tokenizer::TYPE_SYMBOL,     "%",     0,  3,  4 },
    { Tokenizer::TYPE_END,        "",      0,  4,  4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   0,  4,  7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab",   1,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof",   1,  4,  7 },
    { Tokenizer::TYPE_END,        "",      1,  7,  7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz",   0, 16, 19 },
    { Tokenizer::TYPE_END,        "",      0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING,     "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz",          0, 13, 16 },
    { Tokenizer::TYPE_END,        "",             0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   1,  0,  3 },
    { Tokenizer::TYPE_END,        "",      1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   0, 34, 37 },
    { Tokenizer::TYPE_END,        "",      0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_SYMBOL,     "#",     0,  4,  5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   0,  6,  9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz",   1,  0,  3 },
    { Tokenizer::TYPE_END,        "",      1,  3,  3 },
  }},

  // Test all whitespace chars
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo",   0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar",   1, 11, 14 },
    { Tokenizer::TYPE_END,        "",      1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
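//
// Roughly, as exercised by the cases below: a comment on the same line as
// "prev" (or immediately after it but separated from everything that follows
// by a blank line) is reported as prev's trailing comment; the comment block
// immediately preceding "next" is its leading comment; and comment blocks in
// between, set off by blank lines, are reported as detached comments.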
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  std::vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers. It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25, Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3, Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5, Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes. Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error. Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out,
                                const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\X' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n" },

  // Control characters. Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'. We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google