blob: 2c561c23d7ba0ffaac0d99c88fff04de1f461680 [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32// Based on original Protocol Buffers design by
33// Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Implements parsing of .proto files to FileDescriptorProtos.
36
37#ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
38#define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
39
#include <map>
#include <string>
#include <utility>
#include <vector>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
#include <google/protobuf/repeated_field.h>
#include <google/protobuf/io/tokenizer.h>
47
48namespace google {
49namespace protobuf { class Message; }
50
51namespace protobuf {
52namespace compiler {
53
54// Defined in this file.
55class Parser;
56class SourceLocationTable;
57
58// Implements parsing of protocol definitions (such as .proto files).
59//
60// Note that most users will be more interested in the Importer class.
61// Parser is a lower-level class which simply converts a single .proto file
62// to a FileDescriptorProto. It does not resolve import directives or perform
63// many other kinds of validation needed to construct a complete
64// FileDescriptor.
65class LIBPROTOBUF_EXPORT Parser {
66 public:
67 Parser();
68 ~Parser();
69
70 // Parse the entire input and construct a FileDescriptorProto representing
71 // it. Returns true if no errors occurred, false otherwise.
72 bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
73
74 // Optional fetaures:
75
76 // DEPRECATED: New code should use the SourceCodeInfo embedded in the
77 // FileDescriptorProto.
78 //
79 // Requests that locations of certain definitions be recorded to the given
80 // SourceLocationTable while parsing. This can be used to look up exact line
81 // and column numbers for errors reported by DescriptorPool during validation.
82 // Set to NULL (the default) to discard source location information.
83 void RecordSourceLocationsTo(SourceLocationTable* location_table) {
84 source_location_table_ = location_table;
85 }
86
87 // Requests that errors be recorded to the given ErrorCollector while
88 // parsing. Set to NULL (the default) to discard error messages.
89 void RecordErrorsTo(io::ErrorCollector* error_collector) {
90 error_collector_ = error_collector;
91 }
92
93 // Returns the identifier used in the "syntax = " declaration, if one was
94 // seen during the last call to Parse(), or the empty string otherwise.
95 const string& GetSyntaxIdentifier() { return syntax_identifier_; }
96
97 // If set true, input files will be required to begin with a syntax
98 // identifier. Otherwise, files may omit this. If a syntax identifier
99 // is provided, it must be 'syntax = "proto2";' and must appear at the
100 // top of this file regardless of whether or not it was required.
101 void SetRequireSyntaxIdentifier(bool value) {
102 require_syntax_identifier_ = value;
103 }
104
105 // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
106 // parsing as soon as it has seen the syntax identifier, or lack thereof.
107 // This is useful for quickly identifying the syntax of the file without
108 // parsing the whole thing. If this is enabled, no error will be recorded
109 // if the syntax identifier is something other than "proto2" (since
110 // presumably the caller intends to deal with that), but other kinds of
111 // errors (e.g. parse errors) will still be reported. When this is enabled,
112 // you may pass a NULL FileDescriptorProto to Parse().
113 void SetStopAfterSyntaxIdentifier(bool value) {
114 stop_after_syntax_identifier_ = value;
115 }
116
117 private:
118 class LocationRecorder;
119
120 // =================================================================
121 // Error recovery helpers
122
123 // Consume the rest of the current statement. This consumes tokens
124 // until it sees one of:
125 // ';' Consumes the token and returns.
126 // '{' Consumes the brace then calls SkipRestOfBlock().
127 // '}' Returns without consuming.
128 // EOF Returns (can't consume).
129 // The Parser often calls SkipStatement() after encountering a syntax
130 // error. This allows it to go on parsing the following lines, allowing
131 // it to report more than just one error in the file.
132 void SkipStatement();
133
134 // Consume the rest of the current block, including nested blocks,
135 // ending after the closing '}' is encountered and consumed, or at EOF.
136 void SkipRestOfBlock();
137
138 // -----------------------------------------------------------------
139 // Single-token consuming helpers
140 //
141 // These make parsing code more readable.
142
143 // True if the current token is TYPE_END.
144 inline bool AtEnd();
145
146 // True if the next token matches the given text.
147 inline bool LookingAt(const char* text);
148 // True if the next token is of the given type.
149 inline bool LookingAtType(io::Tokenizer::TokenType token_type);
150
151 // If the next token exactly matches the text given, consume it and return
152 // true. Otherwise, return false without logging an error.
153 bool TryConsume(const char* text);
154
155 // These attempt to read some kind of token from the input. If successful,
156 // they return true. Otherwise they return false and add the given error
157 // to the error list.
158
159 // Consume a token with the exact text given.
160 bool Consume(const char* text, const char* error);
161 // Same as above, but automatically generates the error "Expected \"text\".",
162 // where "text" is the expected token text.
163 bool Consume(const char* text);
164 // Consume a token of type IDENTIFIER and store its text in "output".
165 bool ConsumeIdentifier(string* output, const char* error);
166 // Consume an integer and store its value in "output".
167 bool ConsumeInteger(int* output, const char* error);
168 // Consume a signed integer and store its value in "output".
169 bool ConsumeSignedInteger(int* output, const char* error);
170 // Consume a 64-bit integer and store its value in "output". If the value
171 // is greater than max_value, an error will be reported.
172 bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
173 // Consume a number and store its value in "output". This will accept
174 // tokens of either INTEGER or FLOAT type.
175 bool ConsumeNumber(double* output, const char* error);
176 // Consume a string literal and store its (unescaped) value in "output".
177 bool ConsumeString(string* output, const char* error);
178
179 // Consume a token representing the end of the statement. Comments between
180 // this token and the next will be harvested for documentation. The given
181 // LocationRecorder should refer to the declaration that was just parsed;
182 // it will be populated with these comments.
183 //
184 // TODO(kenton): The LocationRecorder is const because historically locations
185 // have been passed around by const reference, for no particularly good
186 // reason. We should probably go through and change them all to mutable
187 // pointer to make this more intuitive.
188 bool TryConsumeEndOfDeclaration(
189 const char* text, const LocationRecorder* location);
190 bool TryConsumeEndOfDeclarationFinishScope(
191 const char* text, const LocationRecorder* location);
192
193 bool ConsumeEndOfDeclaration(
194 const char* text, const LocationRecorder* location);
195
196 // -----------------------------------------------------------------
197 // Error logging helpers
198
199 // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
200 void AddError(int line, int column, const string& error);
201
202 // Invokes error_collector_->AddError() with the line and column number
203 // of the current token.
204 void AddError(const string& error);
205
206 // Records a location in the SourceCodeInfo.location table (see
207 // descriptor.proto). We use RAII to ensure that the start and end locations
208 // are recorded -- the constructor records the start location and the
209 // destructor records the end location. Since the parser is
210 // recursive-descent, this works out beautifully.
211 class LIBPROTOBUF_EXPORT LocationRecorder {
212 public:
213 // Construct the file's "root" location.
214 LocationRecorder(Parser* parser);
215
216 // Construct a location that represents a declaration nested within the
217 // given parent. E.g. a field's location is nested within the location
218 // for a message type. The parent's path will be copied, so you should
219 // call AddPath() only to add the path components leading from the parent
220 // to the child (as opposed to leading from the root to the child).
221 LocationRecorder(const LocationRecorder& parent);
222
223 // Convenience constructors that call AddPath() one or two times.
224 LocationRecorder(const LocationRecorder& parent, int path1);
225 LocationRecorder(const LocationRecorder& parent, int path1, int path2);
226
227 ~LocationRecorder();
228
229 // Add a path component. See SourceCodeInfo.Location.path in
230 // descriptor.proto.
231 void AddPath(int path_component);
232
233 // By default the location is considered to start at the current token at
234 // the time the LocationRecorder is created. StartAt() sets the start
235 // location to the given token instead.
236 void StartAt(const io::Tokenizer::Token& token);
237
238 // Start at the same location as some other LocationRecorder.
239 void StartAt(const LocationRecorder& other);
240
241 // By default the location is considered to end at the previous token at
242 // the time the LocationRecorder is destroyed. EndAt() sets the end
243 // location to the given token instead.
244 void EndAt(const io::Tokenizer::Token& token);
245
246 // Records the start point of this location to the SourceLocationTable that
247 // was passed to RecordSourceLocationsTo(), if any. SourceLocationTable
248 // is an older way of keeping track of source locations which is still
249 // used in some places.
250 void RecordLegacyLocation(const Message* descriptor,
251 DescriptorPool::ErrorCollector::ErrorLocation location);
252
253 // Attaches leading and trailing comments to the location. The two strings
254 // will be swapped into place, so after this is called *leading and
255 // *trailing will be empty.
256 //
257 // TODO(kenton): See comment on TryConsumeEndOfDeclaration(), above, for
258 // why this is const.
259 void AttachComments(string* leading, string* trailing,
260 vector<string>* detached_comments) const;
261
262 private:
263 // Indexes of parent and current location in the parent
264 // SourceCodeInfo.location repeated field. For top-level elements,
265 // parent_index_ is -1.
266 Parser* parser_;
267 SourceCodeInfo::Location* location_;
268
269 void Init(const LocationRecorder& parent);
270 };
271
272 // =================================================================
273 // Parsers for various language constructs
274
275 // Parses the "syntax = \"proto2\";" line at the top of the file. Returns
276 // false if it failed to parse or if the syntax identifier was not
277 // recognized.
278 bool ParseSyntaxIdentifier(const LocationRecorder& parent);
279
280 // These methods parse various individual bits of code. They return
281 // false if they completely fail to parse the construct. In this case,
282 // it is probably necessary to skip the rest of the statement to recover.
283 // However, if these methods return true, it does NOT mean that there
284 // were no errors; only that there were no *syntax* errors. For instance,
285 // if a service method is defined using proper syntax but uses a primitive
286 // type as its input or output, ParseMethodField() still returns true
287 // and only reports the error by calling AddError(). In practice, this
288 // makes logic much simpler for the caller.
289
290 // Parse a top-level message, enum, service, etc.
291 bool ParseTopLevelStatement(FileDescriptorProto* file,
292 const LocationRecorder& root_location);
293
294 // Parse various language high-level language construrcts.
295 bool ParseMessageDefinition(DescriptorProto* message,
296 const LocationRecorder& message_location,
297 const FileDescriptorProto* containing_file);
298 bool ParseEnumDefinition(EnumDescriptorProto* enum_type,
299 const LocationRecorder& enum_location,
300 const FileDescriptorProto* containing_file);
301 bool ParseServiceDefinition(ServiceDescriptorProto* service,
302 const LocationRecorder& service_location,
303 const FileDescriptorProto* containing_file);
304 bool ParsePackage(FileDescriptorProto* file,
305 const LocationRecorder& root_location,
306 const FileDescriptorProto* containing_file);
307 bool ParseImport(RepeatedPtrField<string>* dependency,
308 RepeatedField<int32>* public_dependency,
309 RepeatedField<int32>* weak_dependency,
310 const LocationRecorder& root_location,
311 const FileDescriptorProto* containing_file);
312
313 // These methods parse the contents of a message, enum, or service type and
314 // add them to the given object. They consume the entire block including
315 // the beginning and ending brace.
316 bool ParseMessageBlock(DescriptorProto* message,
317 const LocationRecorder& message_location,
318 const FileDescriptorProto* containing_file);
319 bool ParseEnumBlock(EnumDescriptorProto* enum_type,
320 const LocationRecorder& enum_location,
321 const FileDescriptorProto* containing_file);
322 bool ParseServiceBlock(ServiceDescriptorProto* service,
323 const LocationRecorder& service_location,
324 const FileDescriptorProto* containing_file);
325
326 // Parse one statement within a message, enum, or service block, including
327 // final semicolon.
328 bool ParseMessageStatement(DescriptorProto* message,
329 const LocationRecorder& message_location,
330 const FileDescriptorProto* containing_file);
331 bool ParseEnumStatement(EnumDescriptorProto* message,
332 const LocationRecorder& enum_location,
333 const FileDescriptorProto* containing_file);
334 bool ParseServiceStatement(ServiceDescriptorProto* message,
335 const LocationRecorder& service_location,
336 const FileDescriptorProto* containing_file);
337
338 // Parse a field of a message. If the field is a group, its type will be
339 // added to "messages".
340 //
341 // parent_location and location_field_number_for_nested_type are needed when
342 // parsing groups -- we need to generate a nested message type within the
343 // parent and record its location accordingly. Since the parent could be
344 // either a FileDescriptorProto or a DescriptorProto, we must pass in the
345 // correct field number to use.
346 bool ParseMessageField(FieldDescriptorProto* field,
347 RepeatedPtrField<DescriptorProto>* messages,
348 const LocationRecorder& parent_location,
349 int location_field_number_for_nested_type,
350 const LocationRecorder& field_location,
351 const FileDescriptorProto* containing_file);
352
353 // Like ParseMessageField() but expects the label has already been filled in
354 // by the caller.
355 bool ParseMessageFieldNoLabel(FieldDescriptorProto* field,
356 RepeatedPtrField<DescriptorProto>* messages,
357 const LocationRecorder& parent_location,
358 int location_field_number_for_nested_type,
359 const LocationRecorder& field_location,
360 const FileDescriptorProto* containing_file);
361
362 // Parse an "extensions" declaration.
363 bool ParseExtensions(DescriptorProto* message,
364 const LocationRecorder& extensions_location,
365 const FileDescriptorProto* containing_file);
366
367 // Parse a "reserved" declaration.
368 bool ParseReserved(DescriptorProto* message,
369 const LocationRecorder& message_location);
370 bool ParseReservedNames(DescriptorProto* message,
371 const LocationRecorder& parent_location);
372 bool ParseReservedNumbers(DescriptorProto* message,
373 const LocationRecorder& parent_location);
374
375 // Parse an "extend" declaration. (See also comments for
376 // ParseMessageField().)
377 bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
378 RepeatedPtrField<DescriptorProto>* messages,
379 const LocationRecorder& parent_location,
380 int location_field_number_for_nested_type,
381 const LocationRecorder& extend_location,
382 const FileDescriptorProto* containing_file);
383
384 // Parse a "oneof" declaration. The caller is responsible for setting
385 // oneof_decl->label() since it will have had to parse the label before it
386 // knew it was parsing a oneof.
387 bool ParseOneof(OneofDescriptorProto* oneof_decl,
388 DescriptorProto* containing_type,
389 int oneof_index,
390 const LocationRecorder& oneof_location,
391 const LocationRecorder& containing_type_location,
392 const FileDescriptorProto* containing_file);
393
394 // Parse a single enum value within an enum block.
395 bool ParseEnumConstant(EnumValueDescriptorProto* enum_value,
396 const LocationRecorder& enum_value_location,
397 const FileDescriptorProto* containing_file);
398
399 // Parse enum constant options, i.e. the list in square brackets at the end
400 // of the enum constant value definition.
401 bool ParseEnumConstantOptions(EnumValueDescriptorProto* value,
402 const LocationRecorder& enum_value_location,
403 const FileDescriptorProto* containing_file);
404
405 // Parse a single method within a service definition.
406 bool ParseServiceMethod(MethodDescriptorProto* method,
407 const LocationRecorder& method_location,
408 const FileDescriptorProto* containing_file);
409
410
411 // Parse options of a single method or stream.
412 bool ParseMethodOptions(const LocationRecorder& parent_location,
413 const FileDescriptorProto* containing_file,
414 const int optionsFieldNumber,
415 Message* mutable_options);
416
417 // Parse "required", "optional", or "repeated" and fill in "label"
418 // with the value. Returns true if such a label is consumed.
419 bool ParseLabel(FieldDescriptorProto::Label* label,
420 const FileDescriptorProto* containing_file);
421
422 // Parse a type name and fill in "type" (if it is a primitive) or
423 // "type_name" (if it is not) with the type parsed.
424 bool ParseType(FieldDescriptorProto::Type* type,
425 string* type_name);
426 // Parse a user-defined type and fill in "type_name" with the name.
427 // If a primitive type is named, it is treated as an error.
428 bool ParseUserDefinedType(string* type_name);
429
430 // Parses field options, i.e. the stuff in square brackets at the end
431 // of a field definition. Also parses default value.
432 bool ParseFieldOptions(FieldDescriptorProto* field,
433 const LocationRecorder& field_location,
434 const FileDescriptorProto* containing_file);
435
436 // Parse the "default" option. This needs special handling because its
437 // type is the field's type.
438 bool ParseDefaultAssignment(FieldDescriptorProto* field,
439 const LocationRecorder& field_location,
440 const FileDescriptorProto* containing_file);
441
442 bool ParseJsonName(FieldDescriptorProto* field,
443 const LocationRecorder& field_location,
444 const FileDescriptorProto* containing_file);
445
446 enum OptionStyle {
447 OPTION_ASSIGNMENT, // just "name = value"
448 OPTION_STATEMENT // "option name = value;"
449 };
450
451 // Parse a single option name/value pair, e.g. "ctype = CORD". The name
452 // identifies a field of the given Message, and the value of that field
453 // is set to the parsed value.
454 bool ParseOption(Message* options,
455 const LocationRecorder& options_location,
456 const FileDescriptorProto* containing_file,
457 OptionStyle style);
458
459 // Parses a single part of a multipart option name. A multipart name consists
460 // of names separated by dots. Each name is either an identifier or a series
461 // of identifiers separated by dots and enclosed in parentheses. E.g.,
462 // "foo.(bar.baz).qux".
463 bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option,
464 const LocationRecorder& part_location,
465 const FileDescriptorProto* containing_file);
466
467 // Parses a string surrounded by balanced braces. Strips off the outer
468 // braces and stores the enclosed string in *value.
469 // E.g.,
470 // { foo } *value gets 'foo'
471 // { foo { bar: box } } *value gets 'foo { bar: box }'
472 // {} *value gets ''
473 //
474 // REQUIRES: LookingAt("{")
475 // When finished successfully, we are looking at the first token past
476 // the ending brace.
477 bool ParseUninterpretedBlock(string* value);
478
479 struct MapField {
480 // Whether the field is a map field.
481 bool is_map_field;
482 // The types of the key and value if they are primitive types.
483 FieldDescriptorProto::Type key_type;
484 FieldDescriptorProto::Type value_type;
485 // Or the type names string if the types are customized types.
486 string key_type_name;
487 string value_type_name;
488
489 MapField() : is_map_field(false) {}
490 };
491 // Desugar the map syntax to generate a nested map entry message.
492 void GenerateMapEntry(const MapField& map_field, FieldDescriptorProto* field,
493 RepeatedPtrField<DescriptorProto>* messages);
494
495 // Whether fields without label default to optional fields.
496 bool DefaultToOptionalFields() const {
497 return syntax_identifier_ == "proto3";
498 }
499
500
501 bool ValidateEnum(const EnumDescriptorProto* proto);
502
503 // =================================================================
504
505 io::Tokenizer* input_;
506 io::ErrorCollector* error_collector_;
507 SourceCodeInfo* source_code_info_;
508 SourceLocationTable* source_location_table_; // legacy
509 bool had_errors_;
510 bool require_syntax_identifier_;
511 bool stop_after_syntax_identifier_;
512 string syntax_identifier_;
513
514 // Leading doc comments for the next declaration. These are not complete
515 // yet; use ConsumeEndOfDeclaration() to get the complete comments.
516 string upcoming_doc_comments_;
517
518 // Detached comments are not connected to any syntax entities. Elements in
519 // this vector are paragraphs of comments separated by empty lines. The
520 // detached comments will be put into the leading_detached_comments field for
521 // the next element (See SourceCodeInfo.Location in descriptor.proto), when
522 // ConsumeEndOfDeclaration() is called.
523 vector<string> upcoming_detached_comments_;
524
525 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
526};
527
528// A table mapping (descriptor, ErrorLocation) pairs -- as reported by
529// DescriptorPool when validating descriptors -- to line and column numbers
530// within the original source code.
531//
532// This is semi-obsolete: FileDescriptorProto.source_code_info now contains
533// far more complete information about source locations. However, as of this
534// writing you still need to use SourceLocationTable when integrating with
535// DescriptorPool.
536class LIBPROTOBUF_EXPORT SourceLocationTable {
537 public:
538 SourceLocationTable();
539 ~SourceLocationTable();
540
541 // Finds the precise location of the given error and fills in *line and
542 // *column with the line and column numbers. If not found, sets *line to
543 // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
544 // location" in the ErrorCollector interface). Returns true if found, false
545 // otherwise.
546 bool Find(const Message* descriptor,
547 DescriptorPool::ErrorCollector::ErrorLocation location,
548 int* line, int* column) const;
549
550 // Adds a location to the table.
551 void Add(const Message* descriptor,
552 DescriptorPool::ErrorCollector::ErrorLocation location,
553 int line, int column);
554
555 // Clears the contents of the table.
556 void Clear();
557
558 private:
559 typedef map<
560 pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
561 pair<int, int> > LocationMap;
562 LocationMap location_map_;
563};
564
565} // namespace compiler
566} // namespace protobuf
567
568} // namespace google
569#endif // GOOGLE_PROTOBUF_COMPILER_PARSER_H__