Blame - src/google/protobuf/io/tokenizer.cc - RealtimeRoboticsGroup/test

blob: 3d57707c127f021d390d23db3e0001cb728e840c [file] [log] [blame]

Brian Silverman	9c614bc	2016-02-15 20:20:02 -0500	[diff] [blame^]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc. All rights reserved.
				3	// https://developers.google.com/protocol-buffers/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are
				7	// met:
				8	//
				9	// * Redistributions of source code must retain the above copyright
				10	// notice, this list of conditions and the following disclaimer.
				11	// * Redistributions in binary form must reproduce the above
				12	// copyright notice, this list of conditions and the following disclaimer
				13	// in the documentation and/or other materials provided with the
				14	// distribution.
				15	// * Neither the name of Google Inc. nor the names of its
				16	// contributors may be used to endorse or promote products derived from
				17	// this software without specific prior written permission.
				18	//
				19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30
				31	// Author: kenton@google.com (Kenton Varda)
				32	// Based on original Protocol Buffers design by
				33	// Sanjay Ghemawat, Jeff Dean, and others.
				34	//
				35	// Here we have a hand-written lexer. At first you might ask yourself,
				36	// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
				37	// yes I am crazy, but that's beside the point. There are actually reasons
				38	// why I ended up writing this this way.
				39	//
				40	// The traditional approach to lexing is to use lex to generate a lexer for
				41	// you. Unfortunately, lex's output is ridiculously ugly and difficult to
				42	// integrate cleanly with C++ code, especially abstract code or code meant
				43	// as a library. Better parser-generators exist but would add dependencies
				44	// which most users won't already have, which we'd like to avoid. (GNU flex
				45	// has a C++ output option, but it's still ridiculously ugly, non-abstract,
				46	// and not library-friendly.)
				47	//
				48	// The next approach that any good software engineer should look at is to
				49	// use regular expressions. And, indeed, I did. I have code which
				50	// implements this same class using regular expressions. It's about 200
				51	// lines shorter. However:
				52	// - Rather than error messages telling you "This string has an invalid
				53	// escape sequence at line 5, column 45", you get error messages like
				54	// "Parse error on line 5". Giving more precise errors requires adding
				55	// a lot of code that ends up basically as complex as the hand-coded
				56	// version anyway.
				57	// - The regular expression to match a string literal looks like this:
				58	// kString = new RE("(\"([^\"\\\\]|" // non-escaped
				59	// "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape
				60	// "\\\\x[0-9a-fA-F])*\"|" // hex escape
				61	// "\'([^\'\\\\]|" // Also support single-quotes.
				62	// "\\\\[abfnrtv?\"'\\\\0-7]|"
				63	// "\\\\x[0-9a-fA-F])*\')");
				64	// Verifying the correctness of this line noise is actually harder than
				65	// verifying the correctness of ConsumeString(), defined below. I'm not
				66	// even confident that the above is correct, after staring at it for some
				67	// time.
				68	// - PCRE is fast, but there's still more overhead involved than the code
				69	// below.
				70	// - Sadly, regular expressions are not part of the C standard library, so
				71	// using them would require depending on some other library. For the
				72	// open source release, this could be really annoying. Nobody likes
				73	// downloading one piece of software just to find that they need to
				74	// download something else to make it work, and in all likelihood
				75	// people downloading Protocol Buffers will already be doing so just
				76	// to make something else work. We could include a copy of PCRE with
				77	// our code, but that obligates us to keep it up-to-date and just seems
				78	// like a big waste just to save 200 lines of code.
				79	//
				80	// On a similar but unrelated note, I'm even scared to use ctype.h.
				81	// Apparently functions like isalpha() are locale-dependent. So, if we used
				82	// that, then if this code is being called from some program that doesn't
				83	// have its locale set to "C", it would behave strangely. We can't just set
				84	// the locale to "C" ourselves since we might break the calling program that
				85	// way, particularly if it is multi-threaded. WTF? Someone please let me
				86	// (Kenton) know if I'm missing something here...
				87	//
				88	// I'd love to hear about other alternatives, though, as this code isn't
				89	// exactly pretty.
				90
				91	#include <google/protobuf/io/tokenizer.h>
				92	#include <google/protobuf/stubs/common.h>
				93	#include <google/protobuf/stubs/logging.h>
				94	#include <google/protobuf/stubs/stringprintf.h>
				95	#include <google/protobuf/io/strtod.h>
				96	#include <google/protobuf/io/zero_copy_stream.h>
				97	#include <google/protobuf/stubs/strutil.h>
				98	#include <google/protobuf/stubs/stl_util.h>
				99
				100	namespace google {
				101	namespace protobuf {
				102	namespace io {
				103	namespace {
				104
				105	// As mentioned above, I don't trust ctype.h due to the presence of "locales".
				106	// So, I have written replacement functions here. Someone please smack me if
				107	// this is a bad idea or if there is some way around this.
				108	//
				109	// These "character classes" are designed to be used in template methods.
				110	// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
				111	// whitespace.
				112
				113	// Note: No class is allowed to contain '\0', since this is used to mark end-
				114	// of-input and is handled specially.
				115
				116	#define CHARACTER_CLASS(NAME, EXPRESSION) \
				117	class NAME { \
				118	public: \
				119	static inline bool InClass(char c) { \
				120	return EXPRESSION; \
				121	} \
				122	}
				123
				124	CHARACTER_CLASS(Whitespace, c == ' ' \|\| c == '\n' \|\| c == '\t' \|\|
				125	c == '\r' \|\| c == '\v' \|\| c == '\f');
				126	CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' \|\| c == '\t' \|\|
				127	c == '\r' \|\| c == '\v' \|\| c == '\f');
				128
				129	CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
				130
				131	CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
				132	CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
				133	CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') \|\|
				134	('a' <= c && c <= 'f') \|\|
				135	('A' <= c && c <= 'F'));
				136
				137	CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') \|\|
				138	('A' <= c && c <= 'Z') \|\|
				139	(c == '_'));
				140
				141	CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') \|\|
				142	('A' <= c && c <= 'Z') \|\|
				143	('0' <= c && c <= '9') \|\|
				144	(c == '_'));
				145
				146	CHARACTER_CLASS(Escape, c == 'a' \|\| c == 'b' \|\| c == 'f' \|\| c == 'n' \|\|
				147	c == 'r' \|\| c == 't' \|\| c == 'v' \|\| c == '\\' \|\|
				148	c == '?' \|\| c == '\'' \|\| c == '\"');
				149
				150	#undef CHARACTER_CLASS
				151
				152	// Given a char, interpret it as a numeric digit and return its value.
				153	// This supports any number base up to 36.
				154	inline int DigitValue(char digit) {
				155	if ('0' <= digit && digit <= '9') return digit - '0';
				156	if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
				157	if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
				158	return -1;
				159	}
				160
				161	// Inline because it's only used in one place.
				162	inline char TranslateEscape(char c) {
				163	switch (c) {
				164	case 'a': return '\a';
				165	case 'b': return '\b';
				166	case 'f': return '\f';
				167	case 'n': return '\n';
				168	case 'r': return '\r';
				169	case 't': return '\t';
				170	case 'v': return '\v';
				171	case '\\': return '\\';
				172	case '?': return '\?'; // Trigraphs = :(
				173	case '\'': return '\'';
				174	case '"': return '\"';
				175
				176	// We expect escape sequences to have been validated separately.
				177	default: return '?';
				178	}
				179	}
				180
				181	} // anonymous namespace
				182
				183	ErrorCollector::~ErrorCollector() {}
				184
				185	// ===================================================================
				186
				187	Tokenizer::Tokenizer(ZeroCopyInputStream* input,
				188	ErrorCollector* error_collector)
				189	: input_(input),
				190	error_collector_(error_collector),
				191	buffer_(NULL),
				192	buffer_size_(0),
				193	buffer_pos_(0),
				194	read_error_(false),
				195	line_(0),
				196	column_(0),
				197	record_target_(NULL),
				198	record_start_(-1),
				199	allow_f_after_float_(false),
				200	comment_style_(CPP_COMMENT_STYLE),
				201	require_space_after_number_(true),
				202	allow_multiline_strings_(false) {
				203
				204	current_.line = 0;
				205	current_.column = 0;
				206	current_.end_column = 0;
				207	current_.type = TYPE_START;
				208
				209	Refresh();
				210	}
				211
				212	Tokenizer::~Tokenizer() {
				213	// If we had any buffer left unread, return it to the underlying stream
				214	// so that someone else can read it.
				215	if (buffer_size_ > buffer_pos_) {
				216	input_->BackUp(buffer_size_ - buffer_pos_);
				217	}
				218	}
				219
				220	// -------------------------------------------------------------------
				221	// Internal helpers.
				222
				223	void Tokenizer::NextChar() {
				224	// Update our line and column counters based on the character being
				225	// consumed.
				226	if (current_char_ == '\n') {
				227	++line_;
				228	column_ = 0;
				229	} else if (current_char_ == '\t') {
				230	column_ += kTabWidth - column_ % kTabWidth;
				231	} else {
				232	++column_;
				233	}
				234
				235	// Advance to the next character.
				236	++buffer_pos_;
				237	if (buffer_pos_ < buffer_size_) {
				238	current_char_ = buffer_[buffer_pos_];
				239	} else {
				240	Refresh();
				241	}
				242	}
				243
				244	void Tokenizer::Refresh() {
				245	if (read_error_) {
				246	current_char_ = '\0';
				247	return;
				248	}
				249
				250	// If we're in a token, append the rest of the buffer to it.
				251	if (record_target_ != NULL && record_start_ < buffer_size_) {
				252	record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
				253	record_start_ = 0;
				254	}
				255
				256	const void* data = NULL;
				257	buffer_ = NULL;
				258	buffer_pos_ = 0;
				259	do {
				260	if (!input_->Next(&data, &buffer_size_)) {
				261	// end of stream (or read error)
				262	buffer_size_ = 0;
				263	read_error_ = true;
				264	current_char_ = '\0';
				265	return;
				266	}
				267	} while (buffer_size_ == 0);
				268
				269	buffer_ = static_cast<const char*>(data);
				270
				271	current_char_ = buffer_[0];
				272	}
				273
				274	inline void Tokenizer::RecordTo(string* target) {
				275	record_target_ = target;
				276	record_start_ = buffer_pos_;
				277	}
				278
				279	inline void Tokenizer::StopRecording() {
				280	// Note: The if() is necessary because some STL implementations crash when
				281	// you call string::append(NULL, 0), presumably because they are trying to
				282	// be helpful by detecting the NULL pointer, even though there's nothing
				283	// wrong with reading zero bytes from NULL.
				284	if (buffer_pos_ != record_start_) {
				285	record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
				286	}
				287	record_target_ = NULL;
				288	record_start_ = -1;
				289	}
				290
				291	inline void Tokenizer::StartToken() {
				292	current_.type = TYPE_START; // Just for the sake of initializing it.
				293	current_.text.clear();
				294	current_.line = line_;
				295	current_.column = column_;
				296	RecordTo(&current_.text);
				297	}
				298
				299	inline void Tokenizer::EndToken() {
				300	StopRecording();
				301	current_.end_column = column_;
				302	}
				303
				304	// -------------------------------------------------------------------
				305	// Helper methods that consume characters.
				306
				307	template<typename CharacterClass>
				308	inline bool Tokenizer::LookingAt() {
				309	return CharacterClass::InClass(current_char_);
				310	}
				311
				312	template<typename CharacterClass>
				313	inline bool Tokenizer::TryConsumeOne() {
				314	if (CharacterClass::InClass(current_char_)) {
				315	NextChar();
				316	return true;
				317	} else {
				318	return false;
				319	}
				320	}
				321
				322	inline bool Tokenizer::TryConsume(char c) {
				323	if (current_char_ == c) {
				324	NextChar();
				325	return true;
				326	} else {
				327	return false;
				328	}
				329	}
				330
				331	template<typename CharacterClass>
				332	inline void Tokenizer::ConsumeZeroOrMore() {
				333	while (CharacterClass::InClass(current_char_)) {
				334	NextChar();
				335	}
				336	}
				337
				338	template<typename CharacterClass>
				339	inline void Tokenizer::ConsumeOneOrMore(const char* error) {
				340	if (!CharacterClass::InClass(current_char_)) {
				341	AddError(error);
				342	} else {
				343	do {
				344	NextChar();
				345	} while (CharacterClass::InClass(current_char_));
				346	}
				347	}
				348
				349	// -------------------------------------------------------------------
				350	// Methods that read whole patterns matching certain kinds of tokens
				351	// or comments.
				352
				353	void Tokenizer::ConsumeString(char delimiter) {
				354	while (true) {
				355	switch (current_char_) {
				356	case '\0':
				357	AddError("Unexpected end of string.");
				358	return;
				359
				360	case '\n': {
				361	if (!allow_multiline_strings_) {
				362	AddError("String literals cannot cross line boundaries.");
				363	return;
				364	}
				365	NextChar();
				366	break;
				367	}
				368
				369	case '\\': {
				370	// An escape sequence.
				371	NextChar();
				372	if (TryConsumeOne<Escape>()) {
				373	// Valid escape sequence.
				374	} else if (TryConsumeOne<OctalDigit>()) {
				375	// Possibly followed by two more octal digits, but these will
				376	// just be consumed by the main loop anyway so we don't need
				377	// to do so explicitly here.
				378	} else if (TryConsume('x')) {
				379	if (!TryConsumeOne<HexDigit>()) {
				380	AddError("Expected hex digits for escape sequence.");
				381	}
				382	// Possibly followed by another hex digit, but again we don't care.
				383	} else if (TryConsume('u')) {
				384	if (!TryConsumeOne<HexDigit>() \|\|
				385	!TryConsumeOne<HexDigit>() \|\|
				386	!TryConsumeOne<HexDigit>() \|\|
				387	!TryConsumeOne<HexDigit>()) {
				388	AddError("Expected four hex digits for \\u escape sequence.");
				389	}
				390	} else if (TryConsume('U')) {
				391	// We expect 8 hex digits; but only the range up to 0x10ffff is
				392	// legal.
				393	if (!TryConsume('0') \|\|
				394	!TryConsume('0') \|\|
				395	!(TryConsume('0') \|\| TryConsume('1')) \|\|
				396	!TryConsumeOne<HexDigit>() \|\|
				397	!TryConsumeOne<HexDigit>() \|\|
				398	!TryConsumeOne<HexDigit>() \|\|
				399	!TryConsumeOne<HexDigit>() \|\|
				400	!TryConsumeOne<HexDigit>()) {
				401	AddError("Expected eight hex digits up to 10ffff for \\U escape "
				402	"sequence");
				403	}
				404	} else {
				405	AddError("Invalid escape sequence in string literal.");
				406	}
				407	break;
				408	}
				409
				410	default: {
				411	if (current_char_ == delimiter) {
				412	NextChar();
				413	return;
				414	}
				415	NextChar();
				416	break;
				417	}
				418	}
				419	}
				420	}
				421
				422	Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
				423	bool started_with_dot) {
				424	bool is_float = false;
				425
				426	if (started_with_zero && (TryConsume('x') \|\| TryConsume('X'))) {
				427	// A hex number (started with "0x").
				428	ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
				429
				430	} else if (started_with_zero && LookingAt<Digit>()) {
				431	// An octal number (had a leading zero).
				432	ConsumeZeroOrMore<OctalDigit>();
				433	if (LookingAt<Digit>()) {
				434	AddError("Numbers starting with leading zero must be in octal.");
				435	ConsumeZeroOrMore<Digit>();
				436	}
				437
				438	} else {
				439	// A decimal number.
				440	if (started_with_dot) {
				441	is_float = true;
				442	ConsumeZeroOrMore<Digit>();
				443	} else {
				444	ConsumeZeroOrMore<Digit>();
				445
				446	if (TryConsume('.')) {
				447	is_float = true;
				448	ConsumeZeroOrMore<Digit>();
				449	}
				450	}
				451
				452	if (TryConsume('e') \|\| TryConsume('E')) {
				453	is_float = true;
				454	TryConsume('-') \|\| TryConsume('+');
				455	ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
				456	}
				457
				458	if (allow_f_after_float_ && (TryConsume('f') \|\| TryConsume('F'))) {
				459	is_float = true;
				460	}
				461	}
				462
				463	if (LookingAt<Letter>() && require_space_after_number_) {
				464	AddError("Need space between number and identifier.");
				465	} else if (current_char_ == '.') {
				466	if (is_float) {
				467	AddError(
				468	"Already saw decimal point or exponent; can't have another one.");
				469	} else {
				470	AddError("Hex and octal numbers must be integers.");
				471	}
				472	}
				473
				474	return is_float ? TYPE_FLOAT : TYPE_INTEGER;
				475	}
				476
				477	void Tokenizer::ConsumeLineComment(string* content) {
				478	if (content != NULL) RecordTo(content);
				479
				480	while (current_char_ != '\0' && current_char_ != '\n') {
				481	NextChar();
				482	}
				483	TryConsume('\n');
				484
				485	if (content != NULL) StopRecording();
				486	}
				487
				488	void Tokenizer::ConsumeBlockComment(string* content) {
				489	int start_line = line_;
				490	int start_column = column_ - 2;
				491
				492	if (content != NULL) RecordTo(content);
				493
				494	while (true) {
				495	while (current_char_ != '\0' &&
				496	current_char_ != '*' &&
				497	current_char_ != '/' &&
				498	current_char_ != '\n') {
				499	NextChar();
				500	}
				501
				502	if (TryConsume('\n')) {
				503	if (content != NULL) StopRecording();
				504
				505	// Consume leading whitespace and asterisk;
				506	ConsumeZeroOrMore<WhitespaceNoNewline>();
				507	if (TryConsume('*')) {
				508	if (TryConsume('/')) {
				509	// End of comment.
				510	break;
				511	}
				512	}
				513
				514	if (content != NULL) RecordTo(content);
				515	} else if (TryConsume('*') && TryConsume('/')) {
				516	// End of comment.
				517	if (content != NULL) {
				518	StopRecording();
				519	// Strip trailing "*/".
				520	content->erase(content->size() - 2);
				521	}
				522	break;
				523	} else if (TryConsume('/') && current_char_ == '*') {
				524	// Note: We didn't consume the '*' because if there is a '/' after it
				525	// we want to interpret that as the end of the comment.
				526	AddError(
				527	"\"/*\" inside block comment. Block comments cannot be nested.");
				528	} else if (current_char_ == '\0') {
				529	AddError("End-of-file inside block comment.");
				530	error_collector_->AddError(
				531	start_line, start_column, " Comment started here.");
				532	if (content != NULL) StopRecording();
				533	break;
				534	}
				535	}
				536	}
				537
				538	Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
				539	if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
				540	if (TryConsume('/')) {
				541	return LINE_COMMENT;
				542	} else if (TryConsume('*')) {
				543	return BLOCK_COMMENT;
				544	} else {
				545	// Oops, it was just a slash. Return it.
				546	current_.type = TYPE_SYMBOL;
				547	current_.text = "/";
				548	current_.line = line_;
				549	current_.column = column_ - 1;
				550	current_.end_column = column_;
				551	return SLASH_NOT_COMMENT;
				552	}
				553	} else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
				554	return LINE_COMMENT;
				555	} else {
				556	return NO_COMMENT;
				557	}
				558	}
				559
				560	// -------------------------------------------------------------------
				561
				562	bool Tokenizer::Next() {
				563	previous_ = current_;
				564
				565	while (!read_error_) {
				566	ConsumeZeroOrMore<Whitespace>();
				567
				568	switch (TryConsumeCommentStart()) {
				569	case LINE_COMMENT:
				570	ConsumeLineComment(NULL);
				571	continue;
				572	case BLOCK_COMMENT:
				573	ConsumeBlockComment(NULL);
				574	continue;
				575	case SLASH_NOT_COMMENT:
				576	return true;
				577	case NO_COMMENT:
				578	break;
				579	}
				580
				581	// Check for EOF before continuing.
				582	if (read_error_) break;
				583
				584	if (LookingAt<Unprintable>() \|\| current_char_ == '\0') {
				585	AddError("Invalid control characters encountered in text.");
				586	NextChar();
				587	// Skip more unprintable characters, too. But, remember that '\0' is
				588	// also what current_char_ is set to after EOF / read error. We have
				589	// to be careful not to go into an infinite loop of trying to consume
				590	// it, so make sure to check read_error_ explicitly before consuming
				591	// '\0'.
				592	while (TryConsumeOne<Unprintable>() \|\|
				593	(!read_error_ && TryConsume('\0'))) {
				594	// Ignore.
				595	}
				596
				597	} else {
				598	// Reading some sort of token.
				599	StartToken();
				600
				601	if (TryConsumeOne<Letter>()) {
				602	ConsumeZeroOrMore<Alphanumeric>();
				603	current_.type = TYPE_IDENTIFIER;
				604	} else if (TryConsume('0')) {
				605	current_.type = ConsumeNumber(true, false);
				606	} else if (TryConsume('.')) {
				607	// This could be the beginning of a floating-point number, or it could
				608	// just be a '.' symbol.
				609
				610	if (TryConsumeOne<Digit>()) {
				611	// It's a floating-point number.
				612	if (previous_.type == TYPE_IDENTIFIER &&
				613	current_.line == previous_.line &&
				614	current_.column == previous_.end_column) {
				615	// We don't accept syntax like "blah.123".
				616	error_collector_->AddError(line_, column_ - 2,
				617	"Need space between identifier and decimal point.");
				618	}
				619	current_.type = ConsumeNumber(false, true);
				620	} else {
				621	current_.type = TYPE_SYMBOL;
				622	}
				623	} else if (TryConsumeOne<Digit>()) {
				624	current_.type = ConsumeNumber(false, false);
				625	} else if (TryConsume('\"')) {
				626	ConsumeString('\"');
				627	current_.type = TYPE_STRING;
				628	} else if (TryConsume('\'')) {
				629	ConsumeString('\'');
				630	current_.type = TYPE_STRING;
				631	} else {
				632	// Check if the high order bit is set.
				633	if (current_char_ & 0x80) {
				634	error_collector_->AddError(line_, column_,
				635	StringPrintf("Interpreting non ascii codepoint %d.",
				636	static_cast<unsigned char>(current_char_)));
				637	}
				638	NextChar();
				639	current_.type = TYPE_SYMBOL;
				640	}
				641
				642	EndToken();
				643	return true;
				644	}
				645	}
				646
				647	// EOF
				648	current_.type = TYPE_END;
				649	current_.text.clear();
				650	current_.line = line_;
				651	current_.column = column_;
				652	current_.end_column = column_;
				653	return false;
				654	}
				655
				656	namespace {
				657
				658	// Helper class for collecting comments and putting them in the right places.
				659	//
				660	// This basically just buffers the most recent comment until it can be decided
				661	// exactly where that comment should be placed. When Flush() is called, the
				662	// current comment goes into either prev_trailing_comments or detached_comments.
				663	// When the CommentCollector is destroyed, the last buffered comment goes into
				664	// next_leading_comments.
				665	class CommentCollector {
				666	public:
				667	CommentCollector(string* prev_trailing_comments,
				668	vector<string>* detached_comments,
				669	string* next_leading_comments)
				670	: prev_trailing_comments_(prev_trailing_comments),
				671	detached_comments_(detached_comments),
				672	next_leading_comments_(next_leading_comments),
				673	has_comment_(false),
				674	is_line_comment_(false),
				675	can_attach_to_prev_(true) {
				676	if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
				677	if (detached_comments != NULL) detached_comments->clear();
				678	if (next_leading_comments != NULL) next_leading_comments->clear();
				679	}
				680
				681	~CommentCollector() {
				682	// Whatever is in the buffer is a leading comment.
				683	if (next_leading_comments_ != NULL && has_comment_) {
				684	comment_buffer_.swap(*next_leading_comments_);
				685	}
				686	}
				687
				688	// About to read a line comment. Get the comment buffer pointer in order to
				689	// read into it.
				690	string* GetBufferForLineComment() {
				691	// We want to combine with previous line comments, but not block comments.
				692	if (has_comment_ && !is_line_comment_) {
				693	Flush();
				694	}
				695	has_comment_ = true;
				696	is_line_comment_ = true;
				697	return &comment_buffer_;
				698	}
				699
				700	// About to read a block comment. Get the comment buffer pointer in order to
				701	// read into it.
				702	string* GetBufferForBlockComment() {
				703	if (has_comment_) {
				704	Flush();
				705	}
				706	has_comment_ = true;
				707	is_line_comment_ = false;
				708	return &comment_buffer_;
				709	}
				710
				711	void ClearBuffer() {
				712	comment_buffer_.clear();
				713	has_comment_ = false;
				714	}
				715
				716	// Called once we know that the comment buffer is complete and is not
				717	// connected to the next token.
				718	void Flush() {
				719	if (has_comment_) {
				720	if (can_attach_to_prev_) {
				721	if (prev_trailing_comments_ != NULL) {
				722	prev_trailing_comments_->append(comment_buffer_);
				723	}
				724	can_attach_to_prev_ = false;
				725	} else {
				726	if (detached_comments_ != NULL) {
				727	detached_comments_->push_back(comment_buffer_);
				728	}
				729	}
				730	ClearBuffer();
				731	}
				732	}
				733
				734	void DetachFromPrev() {
				735	can_attach_to_prev_ = false;
				736	}
				737
				738	private:
				739	string* prev_trailing_comments_;
				740	vector<string>* detached_comments_;
				741	string* next_leading_comments_;
				742
				743	string comment_buffer_;
				744
				745	// True if any comments were read into comment_buffer_. This can be true even
				746	// if comment_buffer_ is empty, namely if the comment was "/**/".
				747	bool has_comment_;
				748
				749	// Is the comment in the comment buffer a line comment?
				750	bool is_line_comment_;
				751
				752	// Is it still possible that we could be reading a comment attached to the
				753	// previous token?
				754	bool can_attach_to_prev_;
				755	};
				756
				757	} // namespace
				758
				759	bool Tokenizer::NextWithComments(string* prev_trailing_comments,
				760	vector<string>* detached_comments,
				761	string* next_leading_comments) {
				762	CommentCollector collector(prev_trailing_comments, detached_comments,
				763	next_leading_comments);
				764
				765	if (current_.type == TYPE_START) {
				766	// Ignore unicode byte order mark(BOM) if it appears at the file
				767	// beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
				768	if (TryConsume((char)0xEF)) {
				769	if (!TryConsume((char)0xBB) \|\| !TryConsume((char)0xBF)) {
				770	AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
				771	"Only UTF-8 is accepted for proto file.");
				772	return false;
				773	}
				774	}
				775	collector.DetachFromPrev();
				776	} else {
				777	// A comment appearing on the same line must be attached to the previous
				778	// declaration.
				779	ConsumeZeroOrMore<WhitespaceNoNewline>();
				780	switch (TryConsumeCommentStart()) {
				781	case LINE_COMMENT:
				782	ConsumeLineComment(collector.GetBufferForLineComment());
				783
				784	// Don't allow comments on subsequent lines to be attached to a trailing
				785	// comment.
				786	collector.Flush();
				787	break;
				788	case BLOCK_COMMENT:
				789	ConsumeBlockComment(collector.GetBufferForBlockComment());
				790
				791	ConsumeZeroOrMore<WhitespaceNoNewline>();
				792	if (!TryConsume('\n')) {
				793	// Oops, the next token is on the same line. If we recorded a comment
				794	// we really have no idea which token it should be attached to.
				795	collector.ClearBuffer();
				796	return Next();
				797	}
				798
				799	// Don't allow comments on subsequent lines to be attached to a trailing
				800	// comment.
				801	collector.Flush();
				802	break;
				803	case SLASH_NOT_COMMENT:
				804	return true;
				805	case NO_COMMENT:
				806	if (!TryConsume('\n')) {
				807	// The next token is on the same line. There are no comments.
				808	return Next();
				809	}
				810	break;
				811	}
				812	}
				813
				814	// OK, we are now on the line after the previous token.
				815	while (true) {
				816	ConsumeZeroOrMore<WhitespaceNoNewline>();
				817
				818	switch (TryConsumeCommentStart()) {
				819	case LINE_COMMENT:
				820	ConsumeLineComment(collector.GetBufferForLineComment());
				821	break;
				822	case BLOCK_COMMENT:
				823	ConsumeBlockComment(collector.GetBufferForBlockComment());
				824
				825	// Consume the rest of the line so that we don't interpret it as a
				826	// blank line the next time around the loop.
				827	ConsumeZeroOrMore<WhitespaceNoNewline>();
				828	TryConsume('\n');
				829	break;
				830	case SLASH_NOT_COMMENT:
				831	return true;
				832	case NO_COMMENT:
				833	if (TryConsume('\n')) {
				834	// Completely blank line.
				835	collector.Flush();
				836	collector.DetachFromPrev();
				837	} else {
				838	bool result = Next();
				839	if (!result \|\|
				840	current_.text == "}" \|\|
				841	current_.text == "]" \|\|
				842	current_.text == ")") {
				843	// It looks like we're at the end of a scope. In this case it
				844	// makes no sense to attach a comment to the following token.
				845	collector.Flush();
				846	}
				847	return result;
				848	}
				849	break;
				850	}
				851	}
				852	}
				853
				854	// -------------------------------------------------------------------
				855	// Token-parsing helpers. Remember that these don't need to report
				856	// errors since any errors should already have been reported while
				857	// tokenizing. Also, these can assume that whatever text they
				858	// are given is text that the tokenizer actually parsed as a token
				859	// of the given type.
				860
				861	bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
				862	uint64* output) {
				863	// Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
				864	// is non-standard. I hate the C standard library. :(
				865
				866	// return strtoull(text.c_str(), NULL, 0);
				867
				868	const char* ptr = text.c_str();
				869	int base = 10;
				870	if (ptr[0] == '0') {
				871	if (ptr[1] == 'x' \|\| ptr[1] == 'X') {
				872	// This is hex.
				873	base = 16;
				874	ptr += 2;
				875	} else {
				876	// This is octal.
				877	base = 8;
				878	}
				879	}
				880
				881	uint64 result = 0;
				882	for (; *ptr != '\0'; ptr++) {
				883	int digit = DigitValue(*ptr);
				884	GOOGLE_LOG_IF(DFATAL, digit < 0 \|\| digit >= base)
				885	<< " Tokenizer::ParseInteger() passed text that could not have been"
				886	" tokenized as an integer: " << CEscape(text);
				887	if (digit > max_value \|\| result > (max_value - digit) / base) {
				888	// Overflow.
				889	return false;
				890	}
				891	result = result * base + digit;
				892	}
				893
				894	*output = result;
				895	return true;
				896	}
				897
				898	double Tokenizer::ParseFloat(const string& text) {
				899	const char* start = text.c_str();
				900	char* end;
				901	double result = NoLocaleStrtod(start, &end);
				902
				903	// "1e" is not a valid float, but if the tokenizer reads it, it will
				904	// report an error but still return it as a valid token. We need to
				905	// accept anything the tokenizer could possibly return, error or not.
				906	if (end == 'e' \|\| end == 'E') {
				907	++end;
				908	if (end == '-' \|\| end == '+') ++end;
				909	}
				910
				911	// If the Tokenizer had allow_f_after_float_ enabled, the float may be
				912	// suffixed with the letter 'f'.
				913	if (end == 'f' \|\| end == 'F') {
				914	++end;
				915	}
				916
				917	GOOGLE_LOG_IF(DFATAL, end - start != text.size() \|\| *start == '-')
				918	<< " Tokenizer::ParseFloat() passed text that could not have been"
				919	" tokenized as a float: " << CEscape(text);
				920	return result;
				921	}
				922
				923	// Helper to append a Unicode code point to a string as UTF8, without bringing
				924	// in any external dependencies.
				925	static void AppendUTF8(uint32 code_point, string* output) {
				926	uint32 tmp = 0;
				927	int len = 0;
				928	if (code_point <= 0x7f) {
				929	tmp = code_point;
				930	len = 1;
				931	} else if (code_point <= 0x07ff) {
				932	tmp = 0x0000c080 \|
				933	((code_point & 0x07c0) << 2) \|
				934	(code_point & 0x003f);
				935	len = 2;
				936	} else if (code_point <= 0xffff) {
				937	tmp = 0x00e08080 \|
				938	((code_point & 0xf000) << 4) \|
				939	((code_point & 0x0fc0) << 2) \|
				940	(code_point & 0x003f);
				941	len = 3;
				942	} else if (code_point <= 0x1fffff) {
				943	tmp = 0xf0808080 \|
				944	((code_point & 0x1c0000) << 6) \|
				945	((code_point & 0x03f000) << 4) \|
				946	((code_point & 0x000fc0) << 2) \|
				947	(code_point & 0x003f);
				948	len = 4;
				949	} else {
				950	// UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
				951	// normally only defined up to there as well.
				952	StringAppendF(output, "\\U%08x", code_point);
				953	return;
				954	}
				955	tmp = ghtonl(tmp);
				956	output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
				957	}
				958
				959	// Try to read <len> hex digits from ptr, and stuff the numeric result into
				960	// *result. Returns true if that many digits were successfully consumed.
				961	static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
				962	*result = 0;
				963	if (len == 0) return false;
				964	for (const char* end = ptr + len; ptr < end; ++ptr) {
				965	if (*ptr == '\0') return false;
				966	result = (result << 4) + DigitValue(*ptr);
				967	}
				968	return true;
				969	}
				970
				971	// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
				972	// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
				973	// surrogate. These numbers are in a reserved range of Unicode code points, so
				974	// if we encounter such a pair we know how to parse it and convert it into a
				975	// single code point.
				976	static const uint32 kMinHeadSurrogate = 0xd800;
				977	static const uint32 kMaxHeadSurrogate = 0xdc00;
				978	static const uint32 kMinTrailSurrogate = 0xdc00;
				979	static const uint32 kMaxTrailSurrogate = 0xe000;
				980
				981	static inline bool IsHeadSurrogate(uint32 code_point) {
				982	return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
				983	}
				984
				985	static inline bool IsTrailSurrogate(uint32 code_point) {
				986	return (code_point >= kMinTrailSurrogate) &&
				987	(code_point < kMaxTrailSurrogate);
				988	}
				989
				990	// Combine a head and trail surrogate into a single Unicode code point.
				991	static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
				992	GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
				993	GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
				994	return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) \|
				995	(trail_surrogate - kMinTrailSurrogate));
				996	}
				997
				998	// Convert the escape sequence parameter to a number of expected hex digits.
				999	static inline int UnicodeLength(char key) {
				1000	if (key == 'u') return 4;
				1001	if (key == 'U') return 8;
				1002	return 0;
				1003	}
				1004
				1005	// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
				1006	// to parse that sequence. On success, returns a pointer to the first char
				1007	// beyond that sequence, and fills in *code_point. On failure, returns ptr
				1008	// itself.
				1009	static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
				1010	const char* p = ptr;
				1011	// Fetch the code point.
				1012	const int len = UnicodeLength(*p++);
				1013	if (!ReadHexDigits(p, len, code_point))
				1014	return ptr;
				1015	p += len;
				1016
				1017	// Check if the code point we read is a "head surrogate." If so, then we
				1018	// expect it to be immediately followed by another code point which is a valid
				1019	// "trail surrogate," and together they form a UTF-16 pair which decodes into
				1020	// a single Unicode point. Trail surrogates may only use \u, not \U.
				1021	if (IsHeadSurrogate(code_point) && p == '\\' && *(p + 1) == 'u') {
				1022	uint32 trail_surrogate;
				1023	if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
				1024	IsTrailSurrogate(trail_surrogate)) {
				1025	code_point = AssembleUTF16(code_point, trail_surrogate);
				1026	p += 6;
				1027	}
				1028	// If this failed, then we just emit the head surrogate as a code point.
				1029	// It's bogus, but so is the string.
				1030	}
				1031
				1032	return p;
				1033	}
				1034
				1035	// The text string must begin and end with single or double quote
				1036	// characters.
				1037	void Tokenizer::ParseStringAppend(const string& text, string* output) {
				1038	// Reminder: text[0] is always a quote character. (If text is
				1039	// empty, it's invalid, so we'll just return).
				1040	const size_t text_size = text.size();
				1041	if (text_size == 0) {
				1042	GOOGLE_LOG(DFATAL)
				1043	<< " Tokenizer::ParseStringAppend() passed text that could not"
				1044	" have been tokenized as a string: " << CEscape(text);
				1045	return;
				1046	}
				1047
				1048	// Reserve room for new string. The branch is necessary because if
				1049	// there is already space available the reserve() call might
				1050	// downsize the output.
				1051	const size_t new_len = text_size + output->size();
				1052	if (new_len > output->capacity()) {
				1053	output->reserve(new_len);
				1054	}
				1055
				1056	// Loop through the string copying characters to "output" and
				1057	// interpreting escape sequences. Note that any invalid escape
				1058	// sequences or other errors were already reported while tokenizing.
				1059	// In this case we do not need to produce valid results.
				1060	for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
				1061	if (*ptr == '\\' && ptr[1] != '\0') {
				1062	// An escape sequence.
				1063	++ptr;
				1064
				1065	if (OctalDigit::InClass(*ptr)) {
				1066	// An octal escape. May one, two, or three digits.
				1067	int code = DigitValue(*ptr);
				1068	if (OctalDigit::InClass(ptr[1])) {
				1069	++ptr;
				1070	code = code * 8 + DigitValue(*ptr);
				1071	}
				1072	if (OctalDigit::InClass(ptr[1])) {
				1073	++ptr;
				1074	code = code * 8 + DigitValue(*ptr);
				1075	}
				1076	output->push_back(static_cast<char>(code));
				1077
				1078	} else if (*ptr == 'x') {
				1079	// A hex escape. May zero, one, or two digits. (The zero case
				1080	// will have been caught as an error earlier.)
				1081	int code = 0;
				1082	if (HexDigit::InClass(ptr[1])) {
				1083	++ptr;
				1084	code = DigitValue(*ptr);
				1085	}
				1086	if (HexDigit::InClass(ptr[1])) {
				1087	++ptr;
				1088	code = code * 16 + DigitValue(*ptr);
				1089	}
				1090	output->push_back(static_cast<char>(code));
				1091
				1092	} else if (ptr == 'u' \|\| ptr == 'U') {
				1093	uint32 unicode;
				1094	const char* end = FetchUnicodePoint(ptr, &unicode);
				1095	if (end == ptr) {
				1096	// Failure: Just dump out what we saw, don't try to parse it.
				1097	output->push_back(*ptr);
				1098	} else {
				1099	AppendUTF8(unicode, output);
				1100	ptr = end - 1; // Because we're about to ++ptr.
				1101	}
				1102	} else {
				1103	// Some other escape code.
				1104	output->push_back(TranslateEscape(*ptr));
				1105	}
				1106
				1107	} else if (*ptr == text[0] && ptr[1] == '\0') {
				1108	// Ignore final quote matching the starting quote.
				1109	} else {
				1110	output->push_back(*ptr);
				1111	}
				1112	}
				1113	}
				1114
				1115	template<typename CharacterClass>
				1116	static bool AllInClass(const string& s) {
				1117	for (int i = 0; i < s.size(); ++i) {
				1118	if (!CharacterClass::InClass(s[i]))
				1119	return false;
				1120	}
				1121	return true;
				1122	}
				1123
				1124	bool Tokenizer::IsIdentifier(const string& text) {
				1125	// Mirrors IDENTIFIER definition in Tokenizer::Next() above.
				1126	if (text.size() == 0)
				1127	return false;
				1128	if (!Letter::InClass(text.at(0)))
				1129	return false;
				1130	if (!AllInClass<Alphanumeric>(text.substr(1)))
				1131	return false;
				1132	return true;
				1133	}
				1134
				1135	} // namespace io
				1136	} // namespace protobuf
				1137	} // namespace google