blob: 2cbd21bc2561819c55009c4cd1cf4efcfbaa0f09 [file] [log] [blame]
Brian Silverman9c614bc2016-02-15 20:20:02 -05001# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc. All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9# * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11# * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15# * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example:
34
35 # Create a proto object and serialize it to a text proto string.
36 message = my_proto_pb2.MyMessage(foo='bar')
37 text_proto = text_format.MessageToString(message)
38
39 # Parse a text proto string.
40 message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45import io
46import re
47
48import six
49
50if six.PY3:
Austin Schuh40c16522018-10-28 20:27:54 -070051 long = int # pylint: disable=redefined-builtin,invalid-name
Brian Silverman9c614bc2016-02-15 20:20:02 -050052
Austin Schuh40c16522018-10-28 20:27:54 -070053# pylint: disable=g-import-not-at-top
Brian Silverman9c614bc2016-02-15 20:20:02 -050054from google.protobuf.internal import type_checkers
55from google.protobuf import descriptor
56from google.protobuf import text_encoding
57
__all__ = ['MessageToString', 'PrintMessage', 'PrintField', 'PrintFieldValue',
           'Merge']

# Value checkers used to validate and range-check integer tokens for each of
# the four fixed-width integer field types.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Text-format spellings of special float values: "inf"/"infinity" (optionally
# negated) and "nan", case-insensitive, with an optional trailing 'f'.
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
# cpp_type values treated as floating point when applying float_format.
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
                          descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
# Quote characters accepted around string literals in text format.
_QUOTES = frozenset(("'", '"'))
# Full name of the well-known Any type, which gets special expanded handling.
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
Brian Silverman9c614bc2016-02-15 20:20:02 -050071
72
class Error(Exception):
  """Base class for all errors raised by the text_format module."""
75
76
class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    """Builds the error, prefixing message with 'line[:column] : ' if known."""
    if message is not None and line is not None:
      location = str(line) if column is None else '{0}:{1}'.format(line, column)
      message = '{0} : {1}'.format(location, message)
    if message is None:
      super(ParseError, self).__init__()
    else:
      super(ParseError, self).__init__(message)
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the line of the error, or None if it was not supplied."""
    return self._line

  def GetColumn(self):
    """Returns the column of the error, or None if it was not supplied."""
    return self._column
Brian Silverman9c614bc2016-02-15 20:20:02 -050098
99
class TextWriter(object):
  """In-memory text sink that hides the PY2 (bytes) vs PY3 (str) difference."""

  def __init__(self, as_utf8):
    # Python 2 accumulates UTF-8 encoded bytes; Python 3 accumulates str.
    self._writer = io.BytesIO() if six.PY2 else io.StringIO()

  def write(self, val):
    if six.PY2 and isinstance(val, six.text_type):
      val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()
119
120
def MessageToString(message,
                    as_utf8=False,
                    as_one_line=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    use_field_number=False,
                    descriptor_pool=None,
                    indent=0,
                    message_formatter=None):
  """Convert protobuf message to text format.

  Floating point values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using float_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, float_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    indent: The indent level, in terms of spaces, for pretty print.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      to custom format selected sub-messages (usually based on message type).
      Use to pretty print parts of the protobuf for easier diffing.

  Returns:
    A string of the text formatted protocol buffer message.
  """
  writer = TextWriter(as_utf8)
  _Printer(writer, indent, as_utf8, as_one_line, pointy_brackets,
           use_index_order, float_format, use_field_number,
           descriptor_pool, message_formatter).PrintMessage(message)
  text = writer.getvalue()
  writer.close()
  # One-line output ends with a trailing separator space; strip it.
  return text.rstrip() if as_one_line else text
171
172
def _IsMapEntry(field):
  """Returns True iff field is a synthesized map-entry message field."""
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry
177
178
def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None):
  """Converts message to text format and writes it to the out stream."""
  _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
           use_index_order, float_format, use_field_number,
           descriptor_pool, message_formatter).PrintMessage(message)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500194
195
def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               message_formatter=None):
  """Print a single field name/value pair.

  Args:
    field: The descriptor of the field to be printed.
    value: The value of the field.  For repeated fields, a single element.
    out: A stream to write the output to.
    indent: The indent level, in terms of spaces, for pretty print.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields using the order defined in source
      code instead of the field number.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      to custom format selected sub-messages.
  """
  # Bug fix: message_formatter used to be passed positionally, which made it
  # land in _Printer's use_field_number parameter slot, so the formatter was
  # silently dropped (and misread as a truthy use_field_number).  Pass it by
  # keyword so it binds to the correct parameter.
  printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
                     use_index_order, float_format,
                     message_formatter=message_formatter)
  printer.PrintField(field, value)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500210
211
def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    message_formatter=None):
  """Print a single field value (not including name).

  Args:
    field: The descriptor of the field to be printed.
    value: The value of the field.  For repeated fields, a single element.
    out: A stream to write the output to.
    indent: The indent level, in terms of spaces, for pretty print.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields using the order defined in source
      code instead of the field number.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      to custom format selected sub-messages.
  """
  # Bug fix: message_formatter used to be passed positionally, which made it
  # land in _Printer's use_field_number parameter slot, so the formatter was
  # silently dropped.  Pass it by keyword so it binds correctly.
  printer = _Printer(out, indent, as_utf8, as_one_line, pointy_brackets,
                     use_index_order, float_format,
                     message_formatter=message_formatter)
  printer.PrintFieldValue(field, value)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500226
Brian Silverman9c614bc2016-02-15 20:20:02 -0500227
def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if no Descriptor
    matching type_name was found.
  """
  # Imported here to avoid a cycle at module import time.
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    msg_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    # Unknown type name: the caller treats None as "could not resolve".
    return None
  return database.GetPrototype(msg_descriptor)()
251
252
class _Printer(object):
  """Text format printer for protocol message.

  Holds the formatting options given to MessageToString()/PrintMessage() and
  walks a message tree, writing its text representation to self.out.
  """

  def __init__(self,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               use_field_number=False,
               descriptor_pool=None,
               message_formatter=None):
    """Initialize the Printer.

    Floating point values can be formatted compactly with 15 digits of
    precision (which is the most that IEEE 754 "double" can guarantee)
    using float_format='.15g'. To ensure that converting to text and back to a
    proto will result in an identical value, float_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The indent level for pretty print.
      as_utf8: Produce text output in UTF8 format.
      as_one_line: Don't introduce newlines between fields.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify floating point number formatting
        (per the "Format Specification Mini-Language"); otherwise, str() is
        used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field.

    Returns True if the Any payload type could be resolved and was printed in
    expanded "[type_url] { ... }" form; False to fall back to normal printing.
    """
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      self.out.write('%s[%s]' % (self.indent * ' ', message.type_url))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      return False

  def _TryCustomFormatMessage(self, message):
    """Gives self.message_formatter a chance to format message.

    Returns True if the formatter produced output (which was written), False
    if it returned None and default formatting should be used.
    """
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    # Custom formatting and expanded-Any printing take precedence, in that
    # order, over the default field-by-field walk below.
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      # Regular fields sort by declaration order; extensions by number.
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        for element in value:
          self.PrintField(field, element)
      else:
        self.PrintField(field, value)

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        # MessageSet extensions print the message type's name instead of the
        # extension field's name.
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      # The colon is optional in this case, but our cross-language golden files
      # don't include it.
      out.write(': ')

    self.PrintFieldValue(field, value)
    if self.as_one_line:
      out.write(' ')
    else:
      out.write('\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a nested message value surrounded by braces/brackets.

    In multi-line mode the nested fields are indented two extra spaces.
    """
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write(' %s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write(' %s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      # Unknown enum values (proto3) fall back to the raw number.
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We need to escape non-UTF8 chars in TYPE_BYTES field.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type in _FLOAT_TYPES and self.float_format is not None:
      out.write('{1:{0}}'.format(self.float_format, value))
    else:
      out.write(str(value))
Brian Silverman9c614bc2016-02-15 20:20:02 -0500451
452
def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message. This is different from what the binary msg.ParseFrom(...) does.

  Example
    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text: Message text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  if not isinstance(text, str):
    # Normalize to the native str type: decode on PY3, encode on PY2.
    text = text.decode('utf-8') if six.PY3 else text.encode('utf-8')
  return ParseLines(text.split('\n'),
                    message,
                    allow_unknown_extension,
                    allow_field_number,
                    descriptor_pool=descriptor_pool)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500500
501
def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one.

  Args:
    text: Message text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  if not isinstance(text, str):
    # Normalize to the native str type: decode on PY3, encode on PY2.
    text = text.decode('utf-8') if six.PY3 else text.encode('utf-8')
  return MergeLines(text.split('\n'),
                    message,
                    allow_unknown_extension,
                    allow_field_number,
                    descriptor_pool=descriptor_pool)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500537
538
def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None):
  """Parses a text representation of a protocol message into a message.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(allow_unknown_extension, allow_field_number,
                 descriptor_pool=descriptor_pool).ParseLines(lines, message)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500564
565
def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None):
  """Parses a text representation of a protocol message into a message.

  Like ParseLines(), but allows repeated values for a non-repeated field, and
  uses the last one.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(allow_unknown_extension, allow_field_number,
                 descriptor_pool=descriptor_pool).MergeLines(lines, message)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500591
592
Austin Schuh40c16522018-10-28 20:27:54 -0700593class _Parser(object):
594 """Text format parser for protocol message."""
Brian Silverman9c614bc2016-02-15 20:20:02 -0500595
Austin Schuh40c16522018-10-28 20:27:54 -0700596 def __init__(self,
597 allow_unknown_extension=False,
598 allow_field_number=False,
599 descriptor_pool=None):
600 self.allow_unknown_extension = allow_unknown_extension
601 self.allow_field_number = allow_field_number
602 self.descriptor_pool = descriptor_pool
Brian Silverman9c614bc2016-02-15 20:20:02 -0500603
Austin Schuh40c16522018-10-28 20:27:54 -0700604 def ParseFromString(self, text, message):
605 """Parses a text representation of a protocol message into a message."""
606 if not isinstance(text, str):
607 text = text.decode('utf-8')
608 return self.ParseLines(text.split('\n'), message)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500609
Austin Schuh40c16522018-10-28 20:27:54 -0700610 def ParseLines(self, lines, message):
611 """Parses a text representation of a protocol message into a message."""
612 self._allow_multiple_scalars = False
613 self._ParseOrMerge(lines, message)
614 return message
Brian Silverman9c614bc2016-02-15 20:20:02 -0500615
Austin Schuh40c16522018-10-28 20:27:54 -0700616 def MergeFromString(self, text, message):
617 """Merges a text representation of a protocol message into a message."""
618 return self._MergeLines(text.split('\n'), message)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500619
Austin Schuh40c16522018-10-28 20:27:54 -0700620 def MergeLines(self, lines, message):
621 """Merges a text representation of a protocol message into a message."""
622 self._allow_multiple_scalars = True
623 self._ParseOrMerge(lines, message)
624 return message
Brian Silverman9c614bc2016-02-15 20:20:02 -0500625
Austin Schuh40c16522018-10-28 20:27:54 -0700626 def _ParseOrMerge(self, lines, message):
627 """Converts a text representation of a protocol message into a message.
628
629 Args:
630 lines: Lines of a message's text representation.
631 message: A protocol buffer message to merge into.
632
633 Raises:
634 ParseError: On text parsing problems.
635 """
636 tokenizer = Tokenizer(lines)
637 while not tokenizer.AtEnd():
638 self._MergeField(tokenizer, message)
639
  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Handles expanded Any syntax, extension fields (bracketed names), field
    numbers (when allowed), group-name capitalization, oneof conflict
    checking, and the short repeated format "foo: [1, 2, 3]".

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any form: "[type_url_prefix/full.TypeName] { ... }".  The
    # payload is parsed into a temporary message and then Pack()ed back.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      # The colon before the opening brace is optional.
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix)
      return

    # Extension field: "[qualified.extension.name]".
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          # field stays None; the contents are skipped below.
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # The colon is required before scalar values, optional before messages.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert self.allow_unknown_extension
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')
773
774 def _ConsumeAnyTypeUrl(self, tokenizer):
775 """Consumes a google.protobuf.Any type URL and returns the type name."""
776 # Consume "type.googleapis.com/".
777 prefix = [tokenizer.ConsumeIdentifier()]
778 tokenizer.Consume('.')
779 prefix.append(tokenizer.ConsumeIdentifier())
780 tokenizer.Consume('.')
781 prefix.append(tokenizer.ConsumeIdentifier())
782 tokenizer.Consume('/')
783 # Consume the fully-qualified type name.
Brian Silverman9c614bc2016-02-15 20:20:02 -0500784 name = [tokenizer.ConsumeIdentifier()]
785 while tokenizer.TryConsume('.'):
786 name.append(tokenizer.ConsumeIdentifier())
Austin Schuh40c16522018-10-28 20:27:54 -0700787 return '.'.join(prefix), '.'.join(name)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500788
  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # Nested messages may be delimited by <...> or {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a detached entry message and copied
        # into the map container at the end of this method.
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Mark the singular sub-message present even if no fields follow.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value
Brian Silverman9c614bc2016-02-15 20:20:02 -0500846
Austin Schuh40c16522018-10-28 20:27:54 -0700847 @staticmethod
848 def _IsProto3Syntax(message):
849 message_descriptor = message.DESCRIPTOR
850 return (hasattr(message_descriptor, 'syntax') and
851 message_descriptor.syntax == 'proto3')
852
853 def _MergeScalarField(self, tokenizer, message, field):
854 """Merges a single scalar field into a message.
855
856 Args:
857 tokenizer: A tokenizer to parse the field value.
858 message: A protocol message to record the data.
859 field: The descriptor of the field to be merged.
860
861 Raises:
862 ParseError: In case of text parsing problems.
863 RuntimeError: On runtime errors.
864 """
865 _ = self.allow_unknown_extension
866 value = None
867
868 if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
869 descriptor.FieldDescriptor.TYPE_SINT32,
870 descriptor.FieldDescriptor.TYPE_SFIXED32):
871 value = _ConsumeInt32(tokenizer)
872 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
873 descriptor.FieldDescriptor.TYPE_SINT64,
874 descriptor.FieldDescriptor.TYPE_SFIXED64):
875 value = _ConsumeInt64(tokenizer)
876 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
877 descriptor.FieldDescriptor.TYPE_FIXED32):
878 value = _ConsumeUint32(tokenizer)
879 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
880 descriptor.FieldDescriptor.TYPE_FIXED64):
881 value = _ConsumeUint64(tokenizer)
882 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
883 descriptor.FieldDescriptor.TYPE_DOUBLE):
884 value = tokenizer.ConsumeFloat()
885 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
886 value = tokenizer.ConsumeBool()
887 elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
888 value = tokenizer.ConsumeString()
889 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
890 value = tokenizer.ConsumeByteString()
891 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
892 value = tokenizer.ConsumeEnum(field)
893 else:
894 raise RuntimeError('Unknown field type %d' % field.type)
895
896 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
897 if field.is_extension:
898 message.Extensions[field].append(value)
899 else:
900 getattr(message, field.name).append(value)
901 else:
902 # Proto3 doesn't represent presence so we can't test if multiple scalars
903 # have occurred. We have to allow them.
904 can_check_presence = not self._IsProto3Syntax(message)
905 if field.is_extension:
906 if (not self._allow_multiple_scalars and can_check_presence and
907 message.HasExtension(field)):
908 raise tokenizer.ParseErrorPreviousToken(
909 'Message type "%s" should not have multiple "%s" extensions.' %
910 (message.DESCRIPTOR.full_name, field.full_name))
911 else:
912 message.Extensions[field] = value
913 else:
914 if (not self._allow_multiple_scalars and can_check_presence and
915 message.HasField(field.name)):
916 raise tokenizer.ParseErrorPreviousToken(
917 'Message type "%s" should not have multiple "%s" fields.' %
918 (message.DESCRIPTOR.full_name, field.name))
919 else:
920 setattr(message, field.name, value)
Brian Silverman9c614bc2016-02-15 20:20:02 -0500921
922
def _SkipFieldContents(tokenizer):
  """Skips over contents (value or message) of a field.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Heuristic: a scalar value is introduced by ':' followed by anything
  # other than '{' or '<'. Everything else (no colon, or colon followed by
  # an opening delimiter) must be a message body — or ill-formed input,
  # which the message-skipping path will report.
  is_scalar = (tokenizer.TryConsume(':') and
               not tokenizer.LookingAt('{') and
               not tokenizer.LookingAt('<'))
  if is_scalar:
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)
940
941
def _SkipField(tokenizer):
  """Skips over a complete field (name and value/message).

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  if not tokenizer.TryConsume('['):
    # Plain field name (or unknown-field number).
    tokenizer.ConsumeIdentifierOrNumber()
  else:
    # Bracketed extension name: dotted identifiers up to the closing ']'.
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
963
964
def _SkipFieldMessage(tokenizer):
  """Skips over a field message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Accept either angle-bracket or curly-brace delimiters, and require the
  # close to match the open.
  delimiter = '>' if tokenizer.TryConsume('<') else None
  if delimiter is None:
    tokenizer.Consume('{')
    delimiter = '}'

  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
    _SkipField(tokenizer)

  tokenizer.Consume(delimiter)
982
983
def _SkipFieldValue(tokenizer):
  """Skips over a field value.

  Args:
    tokenizer: A tokenizer to parse the field name and values.

  Raises:
    ParseError: In case an invalid field value is found.
  """
  # String/bytes values may span several adjacent quoted tokens; if the
  # first token is a string, swallow the whole run.
  if tokenizer.TryConsumeByteString():
    while tokenizer.TryConsumeByteString():
      pass
    return

  # Otherwise the value must be an identifier, an integer, or a float.
  consumed = (tokenizer.TryConsumeIdentifier() or
              _TryConsumeInt64(tokenizer) or
              _TryConsumeUint64(tokenizer) or
              tokenizer.TryConsumeFloat())
  if not consumed:
    raise ParseError('Invalid field value: ' + tokenizer.token)
1004
1005
Austin Schuh40c16522018-10-28 20:27:54 -07001006class Tokenizer(object):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001007 """Protocol buffer text representation tokenizer.
1008
1009 This class handles the lower level string parsing by splitting it into
1010 meaningful tokens.
1011
1012 It was directly ported from the Java protocol buffer API.
1013 """
1014
Austin Schuh40c16522018-10-28 20:27:54 -07001015 _WHITESPACE = re.compile(r'\s+')
1016 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1017 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001018 _TOKEN = re.compile('|'.join([
Austin Schuh40c16522018-10-28 20:27:54 -07001019 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier
Brian Silverman9c614bc2016-02-15 20:20:02 -05001020 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number
Austin Schuh40c16522018-10-28 20:27:54 -07001021 ] + [ # quoted str for each quote mark
Brian Silverman9c614bc2016-02-15 20:20:02 -05001022 r'{qt}([^{qt}\n\\]|\\.)*({qt}|\\?$)'.format(qt=mark) for mark in _QUOTES
1023 ]))
1024
Austin Schuh40c16522018-10-28 20:27:54 -07001025 _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1026 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
Brian Silverman9c614bc2016-02-15 20:20:02 -05001027
Austin Schuh40c16522018-10-28 20:27:54 -07001028 def __init__(self, lines, skip_comments=True):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001029 self._position = 0
1030 self._line = -1
1031 self._column = 0
1032 self._token_start = None
1033 self.token = ''
1034 self._lines = iter(lines)
1035 self._current_line = ''
1036 self._previous_line = 0
1037 self._previous_column = 0
1038 self._more_lines = True
Austin Schuh40c16522018-10-28 20:27:54 -07001039 self._skip_comments = skip_comments
1040 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1041 or self._WHITESPACE)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001042 self._SkipWhitespace()
1043 self.NextToken()
1044
1045 def LookingAt(self, token):
1046 return self.token == token
1047
1048 def AtEnd(self):
1049 """Checks the end of the text was reached.
1050
1051 Returns:
1052 True iff the end was reached.
1053 """
1054 return not self.token
1055
1056 def _PopLine(self):
1057 while len(self._current_line) <= self._column:
1058 try:
1059 self._current_line = next(self._lines)
1060 except StopIteration:
1061 self._current_line = ''
1062 self._more_lines = False
1063 return
1064 else:
1065 self._line += 1
1066 self._column = 0
1067
1068 def _SkipWhitespace(self):
1069 while True:
1070 self._PopLine()
Austin Schuh40c16522018-10-28 20:27:54 -07001071 match = self._whitespace_pattern.match(self._current_line, self._column)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001072 if not match:
1073 break
1074 length = len(match.group(0))
1075 self._column += length
1076
1077 def TryConsume(self, token):
1078 """Tries to consume a given piece of text.
1079
1080 Args:
1081 token: Text to consume.
1082
1083 Returns:
1084 True iff the text was consumed.
1085 """
1086 if self.token == token:
1087 self.NextToken()
1088 return True
1089 return False
1090
1091 def Consume(self, token):
1092 """Consumes a piece of text.
1093
1094 Args:
1095 token: Text to consume.
1096
1097 Raises:
1098 ParseError: If the text couldn't be consumed.
1099 """
1100 if not self.TryConsume(token):
Austin Schuh40c16522018-10-28 20:27:54 -07001101 raise self.ParseError('Expected "%s".' % token)
1102
1103 def ConsumeComment(self):
1104 result = self.token
1105 if not self._COMMENT.match(result):
1106 raise self.ParseError('Expected comment.')
1107 self.NextToken()
1108 return result
1109
1110 def ConsumeCommentOrTrailingComment(self):
1111 """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1112
1113 # Tokenizer initializes _previous_line and _previous_column to 0. As the
1114 # tokenizer starts, it looks like there is a previous token on the line.
1115 just_started = self._line == 0 and self._column == 0
1116
1117 before_parsing = self._previous_line
1118 comment = self.ConsumeComment()
1119
1120 # A trailing comment is a comment on the same line than the previous token.
1121 trailing = (self._previous_line == before_parsing
1122 and not just_started)
1123
1124 return trailing, comment
Brian Silverman9c614bc2016-02-15 20:20:02 -05001125
1126 def TryConsumeIdentifier(self):
1127 try:
1128 self.ConsumeIdentifier()
1129 return True
1130 except ParseError:
1131 return False
1132
1133 def ConsumeIdentifier(self):
1134 """Consumes protocol message field identifier.
1135
1136 Returns:
1137 Identifier string.
1138
1139 Raises:
1140 ParseError: If an identifier couldn't be consumed.
1141 """
1142 result = self.token
1143 if not self._IDENTIFIER.match(result):
Austin Schuh40c16522018-10-28 20:27:54 -07001144 raise self.ParseError('Expected identifier.')
Brian Silverman9c614bc2016-02-15 20:20:02 -05001145 self.NextToken()
1146 return result
1147
Austin Schuh40c16522018-10-28 20:27:54 -07001148 def TryConsumeIdentifierOrNumber(self):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001149 try:
Austin Schuh40c16522018-10-28 20:27:54 -07001150 self.ConsumeIdentifierOrNumber()
Brian Silverman9c614bc2016-02-15 20:20:02 -05001151 return True
1152 except ParseError:
1153 return False
1154
Austin Schuh40c16522018-10-28 20:27:54 -07001155 def ConsumeIdentifierOrNumber(self):
1156 """Consumes protocol message field identifier.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001157
1158 Returns:
Austin Schuh40c16522018-10-28 20:27:54 -07001159 Identifier string.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001160
1161 Raises:
Austin Schuh40c16522018-10-28 20:27:54 -07001162 ParseError: If an identifier couldn't be consumed.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001163 """
Austin Schuh40c16522018-10-28 20:27:54 -07001164 result = self.token
1165 if not self._IDENTIFIER_OR_NUMBER.match(result):
1166 raise self.ParseError('Expected identifier or number, got %s.' % result)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001167 self.NextToken()
1168 return result
1169
Austin Schuh40c16522018-10-28 20:27:54 -07001170 def TryConsumeInteger(self):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001171 try:
Austin Schuh40c16522018-10-28 20:27:54 -07001172 # Note: is_long only affects value type, not whether an error is raised.
1173 self.ConsumeInteger()
Brian Silverman9c614bc2016-02-15 20:20:02 -05001174 return True
1175 except ParseError:
1176 return False
1177
Austin Schuh40c16522018-10-28 20:27:54 -07001178 def ConsumeInteger(self, is_long=False):
1179 """Consumes an integer number.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001180
Austin Schuh40c16522018-10-28 20:27:54 -07001181 Args:
1182 is_long: True if the value should be returned as a long integer.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001183 Returns:
1184 The integer parsed.
1185
1186 Raises:
Austin Schuh40c16522018-10-28 20:27:54 -07001187 ParseError: If an integer couldn't be consumed.
Brian Silverman9c614bc2016-02-15 20:20:02 -05001188 """
1189 try:
Austin Schuh40c16522018-10-28 20:27:54 -07001190 result = _ParseAbstractInteger(self.token, is_long=is_long)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001191 except ValueError as e:
Austin Schuh40c16522018-10-28 20:27:54 -07001192 raise self.ParseError(str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001193 self.NextToken()
1194 return result
1195
1196 def TryConsumeFloat(self):
1197 try:
1198 self.ConsumeFloat()
1199 return True
1200 except ParseError:
1201 return False
1202
1203 def ConsumeFloat(self):
1204 """Consumes an floating point number.
1205
1206 Returns:
1207 The number parsed.
1208
1209 Raises:
1210 ParseError: If a floating point number couldn't be consumed.
1211 """
1212 try:
1213 result = ParseFloat(self.token)
1214 except ValueError as e:
Austin Schuh40c16522018-10-28 20:27:54 -07001215 raise self.ParseError(str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001216 self.NextToken()
1217 return result
1218
1219 def ConsumeBool(self):
1220 """Consumes a boolean value.
1221
1222 Returns:
1223 The bool parsed.
1224
1225 Raises:
1226 ParseError: If a boolean value couldn't be consumed.
1227 """
1228 try:
1229 result = ParseBool(self.token)
1230 except ValueError as e:
Austin Schuh40c16522018-10-28 20:27:54 -07001231 raise self.ParseError(str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001232 self.NextToken()
1233 return result
1234
Austin Schuh40c16522018-10-28 20:27:54 -07001235 def TryConsumeByteString(self):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001236 try:
Austin Schuh40c16522018-10-28 20:27:54 -07001237 self.ConsumeByteString()
Brian Silverman9c614bc2016-02-15 20:20:02 -05001238 return True
1239 except ParseError:
1240 return False
1241
1242 def ConsumeString(self):
1243 """Consumes a string value.
1244
1245 Returns:
1246 The string parsed.
1247
1248 Raises:
1249 ParseError: If a string value couldn't be consumed.
1250 """
1251 the_bytes = self.ConsumeByteString()
1252 try:
1253 return six.text_type(the_bytes, 'utf-8')
1254 except UnicodeDecodeError as e:
1255 raise self._StringParseError(e)
1256
1257 def ConsumeByteString(self):
1258 """Consumes a byte array value.
1259
1260 Returns:
1261 The array parsed (as a string).
1262
1263 Raises:
1264 ParseError: If a byte array value couldn't be consumed.
1265 """
1266 the_list = [self._ConsumeSingleByteString()]
1267 while self.token and self.token[0] in _QUOTES:
1268 the_list.append(self._ConsumeSingleByteString())
1269 return b''.join(the_list)
1270
1271 def _ConsumeSingleByteString(self):
1272 """Consume one token of a string literal.
1273
1274 String literals (whether bytes or text) can come in multiple adjacent
1275 tokens which are automatically concatenated, like in C or Python. This
1276 method only consumes one token.
1277
1278 Returns:
1279 The token parsed.
1280 Raises:
1281 ParseError: When the wrong format data is found.
1282 """
1283 text = self.token
1284 if len(text) < 1 or text[0] not in _QUOTES:
Austin Schuh40c16522018-10-28 20:27:54 -07001285 raise self.ParseError('Expected string but found: %r' % (text,))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001286
1287 if len(text) < 2 or text[-1] != text[0]:
Austin Schuh40c16522018-10-28 20:27:54 -07001288 raise self.ParseError('String missing ending quote: %r' % (text,))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001289
1290 try:
1291 result = text_encoding.CUnescape(text[1:-1])
1292 except ValueError as e:
Austin Schuh40c16522018-10-28 20:27:54 -07001293 raise self.ParseError(str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001294 self.NextToken()
1295 return result
1296
1297 def ConsumeEnum(self, field):
1298 try:
1299 result = ParseEnum(field, self.token)
1300 except ValueError as e:
Austin Schuh40c16522018-10-28 20:27:54 -07001301 raise self.ParseError(str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001302 self.NextToken()
1303 return result
1304
1305 def ParseErrorPreviousToken(self, message):
1306 """Creates and *returns* a ParseError for the previously read token.
1307
1308 Args:
1309 message: A message to set for the exception.
1310
1311 Returns:
1312 A ParseError instance.
1313 """
Austin Schuh40c16522018-10-28 20:27:54 -07001314 return ParseError(message, self._previous_line + 1,
1315 self._previous_column + 1)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001316
Austin Schuh40c16522018-10-28 20:27:54 -07001317 def ParseError(self, message):
Brian Silverman9c614bc2016-02-15 20:20:02 -05001318 """Creates and *returns* a ParseError for the current token."""
Austin Schuh40c16522018-10-28 20:27:54 -07001319 return ParseError(message, self._line + 1, self._column + 1)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001320
1321 def _StringParseError(self, e):
Austin Schuh40c16522018-10-28 20:27:54 -07001322 return self.ParseError('Couldn\'t parse string: ' + str(e))
Brian Silverman9c614bc2016-02-15 20:20:02 -05001323
1324 def NextToken(self):
1325 """Reads the next meaningful token."""
1326 self._previous_line = self._line
1327 self._previous_column = self._column
1328
1329 self._column += len(self.token)
1330 self._SkipWhitespace()
1331
1332 if not self._more_lines:
1333 self.token = ''
1334 return
1335
1336 match = self._TOKEN.match(self._current_line, self._column)
Austin Schuh40c16522018-10-28 20:27:54 -07001337 if not match and not self._skip_comments:
1338 match = self._COMMENT.match(self._current_line, self._column)
Brian Silverman9c614bc2016-02-15 20:20:02 -05001339 if match:
1340 token = match.group(0)
1341 self.token = token
1342 else:
1343 self.token = self._current_line[self._column]
1344
Austin Schuh40c16522018-10-28 20:27:54 -07001345# Aliased so it can still be accessed by current visibility violators.
1346# TODO(dbarnett): Migrate violators to textformat_tokenizer.
1347_Tokenizer = Tokenizer # pylint: disable=invalid-name
1348
1349
def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  # Delegate to the generic consumer with 32-bit signed semantics.
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=True)
1363
1364
def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  # Delegate to the generic consumer with 32-bit unsigned semantics.
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=False)
1378
1379
def _TryConsumeInt64(tokenizer):
  """Returns True and consumes the token iff it is a signed 64bit integer."""
  try:
    _ConsumeInt64(tokenizer)
  except ParseError:
    return False
  return True
1386
1387
def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1401
1402
def _TryConsumeUint64(tokenizer):
  """Returns True and consumes the token iff it is an unsigned 64bit integer."""
  try:
    _ConsumeUint64(tokenizer)
  except ParseError:
    return False
  return True
1409
1410
def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  # Delegate to the generic consumer with 64-bit unsigned semantics.
  return _ConsumeInteger(tokenizer, is_long=True, is_signed=False)
1424
1425
def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Returns True and consumes the token iff it is a matching integer."""
  try:
    _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long)
  except ParseError:
    return False
  return True
1432
1433
def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  token = tokenizer.token
  try:
    parsed = ParseInteger(token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    # Re-raise as a ParseError anchored at the offending token.
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return parsed
1454
Brian Silverman9c614bc2016-02-15 20:20:02 -05001455
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Parse first (ValueError propagates to the caller), then range-check
  # against the checker that matches this signedness/width combination.
  result = _ParseAbstractInteger(text, is_long=is_long)
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
1477
1478
Austin Schuh40c16522018-10-28 20:27:54 -07001479def _ParseAbstractInteger(text, is_long=False):
1480 """Parses an integer without checking size/signedness.
1481
1482 Args:
1483 text: The text to parse.
1484 is_long: True if the value should be returned as a long integer.
1485
1486 Returns:
1487 The integer value.
1488
1489 Raises:
1490 ValueError: Thrown Iff the text is not a valid integer.
1491 """
1492 # Do the actual parsing. Exception handling is propagated to caller.
1493 try:
1494 # We force 32-bit values to int and 64-bit values to long to make
1495 # alternate implementations where the distinction is more significant
1496 # (e.g. the C++ implementation) simpler.
1497 if is_long:
1498 return long(text, 0)
1499 else:
1500 return int(text, 0)
1501 except ValueError:
1502 raise ValueError('Couldn\'t parse integer: %s' % text)
1503
1504
Brian Silverman9c614bc2016-02-15 20:20:02 -05001505def ParseFloat(text):
1506 """Parse a floating point number.
1507
1508 Args:
1509 text: Text to parse.
1510
1511 Returns:
1512 The number parsed.
1513
1514 Raises:
1515 ValueError: If a floating point number couldn't be parsed.
1516 """
1517 try:
1518 # Assume Python compatible syntax.
1519 return float(text)
1520 except ValueError:
1521 # Check alternative spellings.
1522 if _FLOAT_INFINITY.match(text):
1523 if text[0] == '-':
1524 return float('-inf')
1525 else:
1526 return float('inf')
1527 elif _FLOAT_NAN.match(text):
1528 return float('nan')
1529 else:
1530 # assume '1.0f' format
1531 try:
1532 return float(text.rstrip('f'))
1533 except ValueError:
1534 raise ValueError('Couldn\'t parse float: %s' % text)
1535
1536
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  # Text format accepts several spellings for each boolean value.
  if text in ('true', 't', '1', 'True'):
    return True
  if text in ('false', 'f', '0', 'False'):
    return False
  raise ValueError('Expected "true" or "false".')
1555
1556
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_type = field.enum_type
  try:
    numeric = int(value, 0)
  except ValueError:
    # Not a number, so look the value up as an enum name.
    named = enum_type.values_by_name.get(value, None)
    if named is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_type.full_name, value))
    return named.number
  # Numeric value. The 'syntax' attribute is checked for compatibility with
  # older descriptors; proto3 accepts unknown numeric enum values.
  if hasattr(field.file, 'syntax') and field.file.syntax == 'proto3':
    return numeric
  known = enum_type.values_by_number.get(numeric, None)
  if known is None:
    raise ValueError('Enum type "%s" has no value with number %d.' %
                     (enum_type.full_name, numeric))
  return known.number