Blame - src/google/protobuf/io/tokenizer.h - RealtimeRoboticsGroup/test

blob: 49885eda9c6d04def8cf3e31699bfa78ddded534 [file] [log] [blame]

Brian Silverman	9c614bc	2016-02-15 20:20:02 -0500	[diff] [blame^]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc. All rights reserved.
				3	// https://developers.google.com/protocol-buffers/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are
				7	// met:
				8	//
				9	// * Redistributions of source code must retain the above copyright
				10	// notice, this list of conditions and the following disclaimer.
				11	// * Redistributions in binary form must reproduce the above
				12	// copyright notice, this list of conditions and the following disclaimer
				13	// in the documentation and/or other materials provided with the
				14	// distribution.
				15	// * Neither the name of Google Inc. nor the names of its
				16	// contributors may be used to endorse or promote products derived from
				17	// this software without specific prior written permission.
				18	//
				19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30
				31	// Author: kenton@google.com (Kenton Varda)
				32	// Based on original Protocol Buffers design by
				33	// Sanjay Ghemawat, Jeff Dean, and others.
				34	//
				35	// Class for parsing tokenized text from a ZeroCopyInputStream.
				36
				37	#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
				38	#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
				39
				40	#include <string>
				41	#include <vector>
				42	#include <google/protobuf/stubs/common.h>
				43	#include <google/protobuf/stubs/logging.h>
				44
				45	namespace google {
				46	namespace protobuf {
				47	namespace io {
				48
				49	class ZeroCopyInputStream; // zero_copy_stream.h
				50
				51	// Defined in this file.
				52	class ErrorCollector;
				53	class Tokenizer;
				54
				55	// Abstract interface for an object which collects the errors that occur
				56	// during parsing. A typical implementation might simply print the errors
				57	// to stdout.
				58	class LIBPROTOBUF_EXPORT ErrorCollector {
				59	public:
				60	inline ErrorCollector() {}
				61	virtual ~ErrorCollector();
				62
				63	// Indicates that there was an error in the input at the given line and
				64	// column numbers. The numbers are zero-based, so you may want to add
				65	// 1 to each before printing them.
				66	virtual void AddError(int line, int column, const string& message) = 0;
				67
				68	// Indicates that there was a warning in the input at the given line and
				69	// column numbers. The numbers are zero-based, so you may want to add
				70	// 1 to each before printing them.
				71	virtual void AddWarning(int /* line /, int / column */,
				72	const string& /* message */) { }
				73
				74	private:
				75	GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
				76	};
				77
				78	// This class converts a stream of raw text into a stream of tokens for
				79	// the protocol definition parser to parse. The tokens recognized are
				80	// similar to those that make up the C language; see the TokenType enum for
				81	// precise descriptions. Whitespace and comments are skipped. By default,
				82	// C- and C++-style comments are recognized, but other styles can be used by
				83	// calling set_comment_style().
				84	class LIBPROTOBUF_EXPORT Tokenizer {
				85	public:
				86	// Construct a Tokenizer that reads and tokenizes text from the given
				87	// input stream and writes errors to the given error_collector.
				88	// The caller keeps ownership of input and error_collector.
				89	Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
				90	~Tokenizer();
				91
				92	enum TokenType {
				93	TYPE_START, // Next() has not yet been called.
				94	TYPE_END, // End of input reached. "text" is empty.
				95
				96	TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not
				97	// starting with a digit. It is an error for a number
				98	// to be followed by an identifier with no space in
				99	// between.
				100	TYPE_INTEGER, // A sequence of digits representing an integer. Normally
				101	// the digits are decimal, but a prefix of "0x" indicates
				102	// a hex number and a leading zero indicates octal, just
				103	// like with C numeric literals. A leading negative sign
				104	// is NOT included in the token; it's up to the parser to
				105	// interpret the unary minus operator on its own.
				106	TYPE_FLOAT, // A floating point literal, with a fractional part and/or
				107	// an exponent. Always in decimal. Again, never
				108	// negative.
				109	TYPE_STRING, // A quoted sequence of escaped characters. Either single
				110	// or double quotes can be used, but they must match.
				111	// A string literal cannot cross a line break.
				112	TYPE_SYMBOL, // Any other printable character, like '!' or '+'.
				113	// Symbols are always a single character, so "!+$%" is
				114	// four tokens.
				115	};
				116
				117	// Structure representing a token read from the token stream.
				118	struct Token {
				119	TokenType type;
				120	string text; // The exact text of the token as it appeared in
				121	// the input. e.g. tokens of TYPE_STRING will still
				122	// be escaped and in quotes.
				123
				124	// "line" and "column" specify the position of the first character of
				125	// the token within the input stream. They are zero-based.
				126	int line;
				127	int column;
				128	int end_column;
				129	};
				130
				131	// Get the current token. This is updated when Next() is called. Before
				132	// the first call to Next(), current() has type TYPE_START and no contents.
				133	const Token& current();
				134
				135	// Return the previous token -- i.e. what current() returned before the
				136	// previous call to Next().
				137	const Token& previous();
				138
				139	// Advance to the next token. Returns false if the end of the input is
				140	// reached.
				141	bool Next();
				142
				143	// Like Next(), but also collects comments which appear between the previous
				144	// and next tokens.
				145	//
				146	// Comments which appear to be attached to the previous token are stored
				147	// in *prev_tailing_comments. Comments which appear to be attached to the
				148	// next token are stored in *next_leading_comments. Comments appearing in
				149	// between which do not appear to be attached to either will be added to
				150	// detached_comments. Any of these parameters can be NULL to simply discard
				151	// the comments.
				152	//
				153	// A series of line comments appearing on consecutive lines, with no other
				154	// tokens appearing on those lines, will be treated as a single comment.
				155	//
				156	// Only the comment content is returned; comment markers (e.g. //) are
				157	// stripped out. For block comments, leading whitespace and an asterisk will
				158	// be stripped from the beginning of each line other than the first. Newlines
				159	// are included in the output.
				160	//
				161	// Examples:
				162	//
				163	// optional int32 foo = 1; // Comment attached to foo.
				164	// // Comment attached to bar.
				165	// optional int32 bar = 2;
				166	//
				167	// optional string baz = 3;
				168	// // Comment attached to baz.
				169	// // Another line attached to baz.
				170	//
				171	// // Comment attached to qux.
				172	// //
				173	// // Another line attached to qux.
				174	// optional double qux = 4;
				175	//
				176	// // Detached comment. This is not attached to qux or corge
				177	// // because there are blank lines separating it from both.
				178	//
				179	// optional string corge = 5;
				180	// /* Block comment attached
				181	// * to corge. Leading asterisks
				182	// * will be removed. */
				183	// /* Block comment attached to
				184	// * grault. */
				185	// optional int32 grault = 6;
				186	bool NextWithComments(string* prev_trailing_comments,
				187	vector<string>* detached_comments,
				188	string* next_leading_comments);
				189
				190	// Parse helpers ---------------------------------------------------
				191
				192	// Parses a TYPE_FLOAT token. This never fails, so long as the text actually
				193	// comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the
				194	// result is undefined (possibly an assert failure).
				195	static double ParseFloat(const string& text);
				196
				197	// Parses a TYPE_STRING token. This never fails, so long as the text actually
				198	// comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
				199	// result is undefined (possibly an assert failure).
				200	static void ParseString(const string& text, string* output);
				201
				202	// Identical to ParseString, but appends to output.
				203	static void ParseStringAppend(const string& text, string* output);
				204
				205	// Parses a TYPE_INTEGER token. Returns false if the result would be
				206	// greater than max_value. Otherwise, returns true and sets *output to the
				207	// result. If the text is not from a Token of type TYPE_INTEGER originally
				208	// parsed by a Tokenizer, the result is undefined (possibly an assert
				209	// failure).
				210	static bool ParseInteger(const string& text, uint64 max_value,
				211	uint64* output);
				212
				213	// Options ---------------------------------------------------------
				214
				215	// Set true to allow floats to be suffixed with the letter 'f'. Tokens
				216	// which would otherwise be integers but which have the 'f' suffix will be
				217	// forced to be interpreted as floats. For all other purposes, the 'f' is
				218	// ignored.
				219	void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
				220
				221	// Valid values for set_comment_style().
				222	enum CommentStyle {
				223	// Line comments begin with "//", block comments are delimited by "/*" and
				224	// "*/".
				225	CPP_COMMENT_STYLE,
				226	// Line comments begin with "#". No way to write block comments.
				227	SH_COMMENT_STYLE
				228	};
				229
				230	// Sets the comment style.
				231	void set_comment_style(CommentStyle style) { comment_style_ = style; }
				232
				233	// Whether to require whitespace between a number and a field name.
				234	// Default is true. Do not use this; for Google-internal cleanup only.
				235	void set_require_space_after_number(bool require) {
				236	require_space_after_number_ = require;
				237	}
				238
				239	// Whether to allow string literals to span multiple lines. Default is false.
				240	// Do not use this; for Google-internal cleanup only.
				241	void set_allow_multiline_strings(bool allow) {
				242	allow_multiline_strings_ = allow;
				243	}
				244
				245	// External helper: validate an identifier.
				246	static bool IsIdentifier(const string& text);
				247
				248	// -----------------------------------------------------------------
				249	private:
				250	GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
				251
				252	Token current_; // Returned by current().
				253	Token previous_; // Returned by previous().
				254
				255	ZeroCopyInputStream* input_;
				256	ErrorCollector* error_collector_;
				257
				258	char current_char_; // == buffer_[buffer_pos_], updated by NextChar().
				259	const char* buffer_; // Current buffer returned from input_.
				260	int buffer_size_; // Size of buffer_.
				261	int buffer_pos_; // Current position within the buffer.
				262	bool read_error_; // Did we previously encounter a read error?
				263
				264	// Line and column number of current_char_ within the whole input stream.
				265	int line_;
				266	int column_;
				267
				268	// String to which text should be appended as we advance through it.
				269	// Call RecordTo(&str) to start recording and StopRecording() to stop.
				270	// E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
				271	// position within the current buffer where recording started.
				272	string* record_target_;
				273	int record_start_;
				274
				275	// Options.
				276	bool allow_f_after_float_;
				277	CommentStyle comment_style_;
				278	bool require_space_after_number_;
				279	bool allow_multiline_strings_;
				280
				281	// Since we count columns we need to interpret tabs somehow. We'll take
				282	// the standard 8-character definition for lack of any way to do better.
				283	static const int kTabWidth = 8;
				284
				285	// -----------------------------------------------------------------
				286	// Helper methods.
				287
				288	// Consume this character and advance to the next one.
				289	void NextChar();
				290
				291	// Read a new buffer from the input.
				292	void Refresh();
				293
				294	inline void RecordTo(string* target);
				295	inline void StopRecording();
				296
				297	// Called when the current character is the first character of a new
				298	// token (not including whitespace or comments).
				299	inline void StartToken();
				300	// Called when the current character is the first character after the
				301	// end of the last token. After this returns, current_.text will
				302	// contain all text consumed since StartToken() was called.
				303	inline void EndToken();
				304
				305	// Convenience method to add an error at the current line and column.
				306	void AddError(const string& message) {
				307	error_collector_->AddError(line_, column_, message);
				308	}
				309
				310	// -----------------------------------------------------------------
				311	// The following four methods are used to consume tokens of specific
				312	// types. They are actually used to consume all characters after
				313	// the first, since the calling function consumes the first character
				314	// in order to decide what kind of token is being read.
				315
				316	// Read and consume a string, ending when the given delimiter is
				317	// consumed.
				318	void ConsumeString(char delimiter);
				319
				320	// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
				321	// depending on what was read. This needs to know if the first
				322	// character was a zero in order to correctly recognize hex and octal
				323	// numbers.
				324	// It also needs to know if the first characted was a . to parse floating
				325	// point correctly.
				326	TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
				327
				328	// Consume the rest of a line.
				329	void ConsumeLineComment(string* content);
				330	// Consume until "*/".
				331	void ConsumeBlockComment(string* content);
				332
				333	enum NextCommentStatus {
				334	// Started a line comment.
				335	LINE_COMMENT,
				336
				337	// Started a block comment.
				338	BLOCK_COMMENT,
				339
				340	// Consumed a slash, then realized it wasn't a comment. current_ has
				341	// been filled in with a slash token. The caller should return it.
				342	SLASH_NOT_COMMENT,
				343
				344	// We do not appear to be starting a comment here.
				345	NO_COMMENT
				346	};
				347
				348	// If we're at the start of a new comment, consume it and return what kind
				349	// of comment it is.
				350	NextCommentStatus TryConsumeCommentStart();
				351
				352	// -----------------------------------------------------------------
				353	// These helper methods make the parsing code more readable. The
				354	// "character classes" referred to are defined at the top of the .cc file.
				355	// Basically it is a C++ class with one method:
				356	// static bool InClass(char c);
				357	// The method returns true if c is a member of this "class", like "Letter"
				358	// or "Digit".
				359
				360	// Returns true if the current character is of the given character
				361	// class, but does not consume anything.
				362	template<typename CharacterClass>
				363	inline bool LookingAt();
				364
				365	// If the current character is in the given class, consume it and return
				366	// true. Otherwise return false.
				367	// e.g. TryConsumeOne<Letter>()
				368	template<typename CharacterClass>
				369	inline bool TryConsumeOne();
				370
				371	// Like above, but try to consume the specific character indicated.
				372	inline bool TryConsume(char c);
				373
				374	// Consume zero or more of the given character class.
				375	template<typename CharacterClass>
				376	inline void ConsumeZeroOrMore();
				377
				378	// Consume one or more of the given character class or log the given
				379	// error message.
				380	// e.g. ConsumeOneOrMore<Digit>("Expected digits.");
				381	template<typename CharacterClass>
				382	inline void ConsumeOneOrMore(const char* error);
				383	};
				384
				385	// inline methods ====================================================
				386	inline const Tokenizer::Token& Tokenizer::current() {
				387	return current_;
				388	}
				389
				390	inline const Tokenizer::Token& Tokenizer::previous() {
				391	return previous_;
				392	}
				393
				394	inline void Tokenizer::ParseString(const string& text, string* output) {
				395	output->clear();
				396	ParseStringAppend(text, output);
				397	}
				398
				399	} // namespace io
				400	} // namespace protobuf
				401
				402	} // namespace google
				403	#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__