# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc. All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9# * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11# * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15# * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example:
34
35 # Create a proto object and serialize it to a text proto string.
36 message = my_proto_pb2.MyMessage(foo='bar')
37 text_proto = text_format.MessageToString(message)
38
39 # Parse a text proto string.
40 message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45import io
46import re
47
48import six
49
if six.PY3:
  long = int  # Python 3 has no separate long type; alias it for ParseInteger.
52
53from google.protobuf.internal import type_checkers
54from google.protobuf import descriptor
55from google.protobuf import text_encoding
56
__all__ = ['MessageToString', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge']


# Range checkers, indexed as 2 * int(is_long) + int(is_signed); see
# ParseInteger.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Alternative text-format spellings of special floats, e.g. '-Infinity',
# 'inff', 'nanf' (used by ParseFloat after float() itself fails).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?', re.IGNORECASE)
# cpp_types that honor the float_format option when printing.
_FLOAT_TYPES = frozenset([descriptor.FieldDescriptor.CPPTYPE_FLOAT,
                          descriptor.FieldDescriptor.CPPTYPE_DOUBLE])
# Quote characters accepted for string literals in text format.
_QUOTES = frozenset(("'", '"'))
70
71
class Error(Exception):
  """Top-level module error for text_format.

  Base class for exceptions raised by this module (see ParseError).
  """
74
75
class ParseError(Error):
  """Thrown in case of text parsing error.

  Error messages produced by the tokenizer include a 'line:column : reason'
  prefix locating the offending token.
  """
78
79
class TextWriter(object):
  """Minimal file-like text accumulator.

  Backed by a BytesIO holding UTF-8 bytes on Python 2 and a StringIO
  holding text on Python 3, so callers may write either str or unicode.
  """

  def __init__(self, as_utf8):
    # Choose the backing buffer once, based on the interpreter version.
    self._writer = io.BytesIO() if six.PY2 else io.StringIO()

  def write(self, val):
    """Appends val to the buffer, encoding unicode as UTF-8 on Python 2."""
    if six.PY2 and isinstance(val, six.text_type):
      val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    """Closes the underlying buffer."""
    return self._writer.close()

  def getvalue(self):
    """Returns everything written so far."""
    return self._writer.getvalue()
98
99
def MessageToString(message, as_utf8=False, as_one_line=False,
                    pointy_brackets=False, use_index_order=False,
                    float_format=None):
  """Convert protobuf message to text format.

  Floating point values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using float_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, float_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Produce text output in UTF8 format.
    as_one_line: Don't introduce newlines between fields.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields of a proto message using the order
      defined in source code instead of the field number. By default, use the
      field number order.
    float_format: If set, use this to specify floating point number formatting
      (per the "Format Specification Mini-Language"); otherwise, str() is used.

  Returns:
    A string of the text formatted protocol buffer message.
  """
  writer = TextWriter(as_utf8)
  PrintMessage(message, writer, as_utf8=as_utf8, as_one_line=as_one_line,
               pointy_brackets=pointy_brackets,
               use_index_order=use_index_order,
               float_format=float_format)
  text = writer.getvalue()
  writer.close()
  # One-line rendering leaves a trailing separator space after the last field.
  return text.rstrip() if as_one_line else text
135
136
def _IsMapEntry(field):
  """Returns True iff field is the synthesized entry type of a map field."""
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry
141
142
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False,
                 pointy_brackets=False, use_index_order=False,
                 float_format=None):
  """Writes every set field of message to out in text format."""
  fields = message.ListFields()
  if use_index_order:
    # Order by declaration position in the .proto file instead of by number.
    fields.sort(key=lambda entry: entry[0].index)
  for field, value in fields:

    def emit(item):
      # Prints one name/value pair for the current field.
      PrintField(field, item, out, indent, as_utf8, as_one_line,
                 pointy_brackets=pointy_brackets,
                 use_index_order=use_index_order,
                 float_format=float_format)

    if _IsMapEntry(field):
      for key in sorted(value):
        # Synthesize a map-entry message per key.  This copies the whole
        # entry (slow for maps with submessage values) but keeps the printer
        # simple.
        # TODO(haberman): refactor and optimize if this becomes an issue.
        emit(field.message_type._concrete_class(key=key, value=value[key]))
    elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      for element in value:
        emit(element)
    else:
      emit(value)
173
174
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False,
               pointy_brackets=False, use_index_order=False, float_format=None):
  """Print a single field name/value pair.  For repeated fields, the value
  should be a single element.
  """
  out.write(' ' * indent)

  # Field name: extensions go in square brackets, groups print their
  # capitalized type name, everything else its plain field name.
  if field.is_extension:
    is_message_set_extension = (
        field.containing_type.GetOptions().message_set_wire_format and
        field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
        field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL)
    if is_message_set_extension:
      # MessageSet extensions are identified by their message type name.
      out.write('[%s]' % field.message_type.full_name)
    else:
      out.write('[%s]' % field.full_name)
  elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line,
                  pointy_brackets=pointy_brackets,
                  use_index_order=use_index_order,
                  float_format=float_format)
  out.write(' ' if as_one_line else '\n')
210
211
def PrintFieldValue(field, value, out, indent=0, as_utf8=False,
                    as_one_line=False, pointy_brackets=False,
                    use_index_order=False,
                    float_format=None):
  """Print a single field value (not including name).  For repeated fields,
  the value should be a single element."""
  openb, closeb = ('<', '>') if pointy_brackets else ('{', '}')
  cpp_type = field.cpp_type

  if cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    if as_one_line:
      out.write(' %s ' % openb)
      PrintMessage(value, out, indent, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   use_index_order=use_index_order,
                   float_format=float_format)
      out.write(closeb)
    else:
      out.write(' %s\n' % openb)
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line,
                   pointy_brackets=pointy_brackets,
                   use_index_order=use_index_order,
                   float_format=float_format)
      out.write(' ' * indent + closeb)
  elif cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
    # Prefer the symbolic name; fall back to the raw number for values
    # outside the declared enum.
    enum_value = field.enum_type.values_by_number.get(value, None)
    out.write(enum_value.name if enum_value is not None else str(value))
  elif cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
    out.write('\"')
    if isinstance(value, six.text_type):
      out_value = value.encode('utf-8')
    else:
      out_value = value
    # TYPE_BYTES may hold arbitrary binary data, so its non-ASCII bytes are
    # always escaped rather than passed through as UTF-8.
    out_as_utf8 = (as_utf8 and
                   field.type != descriptor.FieldDescriptor.TYPE_BYTES)
    out.write(text_encoding.CEscape(out_value, out_as_utf8))
    out.write('\"')
  elif cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
    out.write('true' if value else 'false')
  elif cpp_type in _FLOAT_TYPES and float_format is not None:
    out.write('{1:{0}}'.format(float_format, value))
  else:
    out.write(str(value))
269
270
def Parse(text, message, allow_unknown_extension=False):
  """Parses a text representation of a protocol message into a message.

  Args:
    text: Message text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  if not isinstance(text, str):
    # Accept UTF-8 encoded bytes as well as native strings.
    text = text.decode('utf-8')
  lines = text.split('\n')
  return ParseLines(lines, message, allow_unknown_extension)
289
290
def Merge(text, message, allow_unknown_extension=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one.

  Args:
    text: Message text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  lines = text.split('\n')
  return MergeLines(lines, message, allow_unknown_extension)
310
311
def ParseLines(lines, message, allow_unknown_extension=False):
  """Parses a text representation of a protocol message into a message.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Strict mode: repeated values for a singular field raise ParseError.
  _ParseOrMerge(lines, message, allow_multiple_scalars=False,
                allow_unknown_extension=allow_unknown_extension)
  return message
329
330
def MergeLines(lines, message, allow_unknown_extension=False):
  """Parses a text representation of a protocol message into a message.

  Like ParseLines(), but repeated values for a singular field are allowed
  and the last one wins.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  _ParseOrMerge(lines, message, allow_multiple_scalars=True,
                allow_unknown_extension=allow_unknown_extension)
  return message
348
349
def _ParseOrMerge(lines,
                  message,
                  allow_multiple_scalars,
                  allow_unknown_extension=False):
  """Converts a text representation of a protocol message into a message.

  Args:
    lines: Lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Raises:
    ParseError: On text parsing problems.
  """
  tok = _Tokenizer(lines)
  # Consume one top-level field per iteration until the input is exhausted.
  while not tok.AtEnd():
    _MergeField(tok, message, allow_multiple_scalars,
                allow_unknown_extension)
372
373
def _MergeField(tokenizer,
                message,
                allow_multiple_scalars,
                allow_unknown_extension=False):
  """Merges a single protocol message field into a message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing

  Raises:
    ParseError: In case of text parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if (hasattr(message_descriptor, 'syntax') and
      message_descriptor.syntax == 'proto3'):
    # Proto3 doesn't represent presence so we can't test if multiple
    # scalars have occurred. We have to allow them.
    allow_multiple_scalars = True
  if tokenizer.TryConsume('['):
    # Extension field, written as a bracketed dotted name:
    # [package.ExtensionName].
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    # pylint: disable=protected-access
    field = message.Extensions._FindExtensionByName(name)
    # pylint: enable=protected-access
    if not field:
      if allow_unknown_extension:
        # field stays None; the unknown-field branch below skips the value.
        field = None
      else:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
              name, message_descriptor.full_name))

    tokenizer.Consume(']')

  else:
    # Regular field, looked up by name on the message descriptor.
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
              message_descriptor.full_name, name))

  if field and field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # Message-typed field: parse a nested block delimited by {} or <>.
    is_map_entry = _IsMapEntry(field)
    # The colon is optional before a nested message block.
    tokenizer.TryConsume(':')

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a standalone entry message first and
        # folded into the map container after the block is closed.
        sub_message = field.message_type._concrete_class()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      # Mark presence even if the nested block turns out to be empty.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message, allow_multiple_scalars,
                  allow_unknown_extension)

    if is_map_entry:
      # Message-valued map entries merge into any existing value; scalar
      # values simply overwrite.
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value
  elif field:
    # Scalar field: a mandatory ':' then either one value or a bracketed
    # comma-separated list for repeated fields.
    tokenizer.Consume(':')
    if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
        tokenizer.TryConsume('[')):
      # Short repeated format, e.g. "foo: [1, 2, 3]"
      while True:
        _MergeScalarField(tokenizer, message, field, allow_multiple_scalars)
        if tokenizer.TryConsume(']'):
          break
        tokenizer.Consume(',')
    else:
      _MergeScalarField(tokenizer, message, field, allow_multiple_scalars)
  else:  # Proto field is unknown.
    assert allow_unknown_extension
    _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
502
503
def _SkipFieldContents(tokenizer):
  """Skips over contents (value or message) of a field.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Guess the field's shape: a scalar field is written `name: value`, while
  # a message field is `name { ... }` / `name < ... >` with an optional
  # colon.  So a ':' NOT followed by an opening bracket means scalar;
  # anything else must be a message body (or the input is ill-formed).
  has_colon = tokenizer.TryConsume(':')
  if has_colon and not (tokenizer.LookingAt('{') or tokenizer.LookingAt('<')):
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)
521
522
def _SkipField(tokenizer):
  """Skips over a complete field (name and value/message).

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  if tokenizer.TryConsume('['):
    # Extension name: a dotted identifier wrapped in square brackets.
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')
  else:
    tokenizer.ConsumeIdentifier()

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  tokenizer.TryConsume(',') or tokenizer.TryConsume(';')
544
545
546def _SkipFieldMessage(tokenizer):
547 """Skips over a field message.
548
549 Args:
550 tokenizer: A tokenizer to parse the field name and values.
551 """
552
553 if tokenizer.TryConsume('<'):
554 delimiter = '>'
555 else:
556 tokenizer.Consume('{')
557 delimiter = '}'
558
559 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
560 _SkipField(tokenizer)
561
562 tokenizer.Consume(delimiter)
563
564
565def _SkipFieldValue(tokenizer):
566 """Skips over a field value.
567
568 Args:
569 tokenizer: A tokenizer to parse the field name and values.
570
571 Raises:
572 ParseError: In case an invalid field value is found.
573 """
574 # String tokens can come in multiple adjacent string literals.
575 # If we can consume one, consume as many as we can.
576 if tokenizer.TryConsumeString():
577 while tokenizer.TryConsumeString():
578 pass
579 return
580
581 if (not tokenizer.TryConsumeIdentifier() and
582 not tokenizer.TryConsumeInt64() and
583 not tokenizer.TryConsumeUint64() and
584 not tokenizer.TryConsumeFloat()):
585 raise ParseError('Invalid field value: ' + tokenizer.token)
586
587
def _MergeScalarField(tokenizer, message, field, allow_multiple_scalars):
  """Merges a single protocol message scalar field into a message.

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.
    allow_multiple_scalars: Determines if repeated values for a non-repeated
      field are permitted, e.g., the string "foo: 1 foo: 2" for a
      required/optional field named "foo".

  Raises:
    ParseError: In case of text parsing problems.
    RuntimeError: On runtime errors.
  """
  fd = descriptor.FieldDescriptor
  # Map each scalar wire type to the tokenizer routine that consumes it.
  consumers = {
      fd.TYPE_INT32: tokenizer.ConsumeInt32,
      fd.TYPE_SINT32: tokenizer.ConsumeInt32,
      fd.TYPE_SFIXED32: tokenizer.ConsumeInt32,
      fd.TYPE_INT64: tokenizer.ConsumeInt64,
      fd.TYPE_SINT64: tokenizer.ConsumeInt64,
      fd.TYPE_SFIXED64: tokenizer.ConsumeInt64,
      fd.TYPE_UINT32: tokenizer.ConsumeUint32,
      fd.TYPE_FIXED32: tokenizer.ConsumeUint32,
      fd.TYPE_UINT64: tokenizer.ConsumeUint64,
      fd.TYPE_FIXED64: tokenizer.ConsumeUint64,
      fd.TYPE_FLOAT: tokenizer.ConsumeFloat,
      fd.TYPE_DOUBLE: tokenizer.ConsumeFloat,
      fd.TYPE_BOOL: tokenizer.ConsumeBool,
      fd.TYPE_STRING: tokenizer.ConsumeString,
      fd.TYPE_BYTES: tokenizer.ConsumeByteString,
      fd.TYPE_ENUM: lambda: tokenizer.ConsumeEnum(field),
  }
  try:
    consume = consumers[field.type]
  except KeyError:
    raise RuntimeError('Unknown field type %d' % field.type)
  value = consume()

  if field.label == fd.LABEL_REPEATED:
    # Repeated fields simply collect every value that appears.
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  elif field.is_extension:
    if not allow_multiple_scalars and message.HasExtension(field):
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" should not have multiple "%s" extensions.' %
          (message.DESCRIPTOR.full_name, field.full_name))
    message.Extensions[field] = value
  else:
    if not allow_multiple_scalars and message.HasField(field.name):
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" should not have multiple "%s" fields.' %
          (message.DESCRIPTOR.full_name, field.name))
    setattr(message, field.name, value)
653
654
class _Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  # Runs of whitespace and '#'-to-end-of-line comments, skipped together.
  _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
  # One token: an identifier, a number, or a quoted string literal (one
  # alternative per quote character in _QUOTES).
  _TOKEN = re.compile('|'.join([
      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
  ] + [  # quoted str for each quote mark
      r'{qt}([^{qt}\n\\]|\\.)*({qt}|\\?$)'.format(qt=mark) for mark in _QUOTES
  ]))

  _IDENTIFIER = re.compile(r'\w+')

  def __init__(self, lines):
    self._position = 0
    self._line = -1  # 0-based index of the current line; -1 until _PopLine.
    self._column = 0  # 0-based column where the current token starts.
    self._token_start = None
    self.token = ''  # Current token text; '' means end of input.
    self._lines = iter(lines)
    self._current_line = ''
    self._previous_line = 0  # Position of the previous token, for errors.
    self._previous_column = 0
    self._more_lines = True
    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals token (nothing consumed)."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    # Advance to the next input line once the column pointer has run off the
    # end of the current one; clears _more_lines when input is exhausted.
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    # Repeatedly pop lines and skip whitespace/comment runs until the next
    # significant character (or end of input).
    while True:
      self._PopLine()
      match = self._WHITESPACE.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self._ParseError('Expected "%s".' % token)

  def TryConsumeIdentifier(self):
    """Returns True and advances iff the current token is an identifier."""
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self._ParseError('Expected identifier.')
    self.NextToken()
    return result

  def ConsumeInt32(self):
    """Consumes a signed 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=False)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeUint32(self):
    """Consumes an unsigned 32bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 32bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=False)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeInt64(self):
    """Returns True and advances iff a signed 64bit integer was consumed."""
    try:
      self.ConsumeInt64()
      return True
    except ParseError:
      return False

  def ConsumeInt64(self):
    """Consumes a signed 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If a signed 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=True, is_long=True)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeUint64(self):
    """Returns True and advances iff an unsigned 64bit integer was consumed."""
    try:
      self.ConsumeUint64()
      return True
    except ParseError:
      return False

  def ConsumeUint64(self):
    """Consumes an unsigned 64bit integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an unsigned 64bit integer couldn't be consumed.
    """
    try:
      result = ParseInteger(self.token, is_signed=False, is_long=True)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    """Returns True and advances iff a floating point number was consumed."""
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeString(self):
    """Returns True and advances iff a string literal was consumed."""
    try:
      self.ConsumeString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, as in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self._ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self._ParseError('String missing ending quote: %r' % (text,))

    try:
      # Strip the surrounding quotes and undo C-style escapes.
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes an enum value (by name or number) for the given field.

    Returns:
      The numeric value of the enum.

    Raises:
      ParseError: If the token is not a valid value of field's enum type.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self._ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError('%d:%d : %s' % (
        self._previous_line + 1, self._previous_column + 1, message))

  def _ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('%d:%d : %s' % (
        self._line + 1, self._column + 1, message))

  def _StringParseError(self, e):
    return self._ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # Not a recognized token: expose the single offending character so
      # Consume() can report it in a ParseError.
      self.token = self._current_line[self._column]
993
994
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # We force 32-bit values to int and 64-bit values to long to make
  # alternate implementations where the distinction is more significant
  # (e.g. the C++ implementation) simpler.
  converter = long if is_long else int
  try:
    # Base 0 accepts decimal, hex (0x...) and octal literals.
    result = converter(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % text)

  # Range-check the value; the checker raises for callers to handle.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
1025
1026
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Fast path: Python's own float syntax covers most inputs.
    return float(text)
  except ValueError:
    pass

  # Fall back to the protobuf-specific spellings.
  if _FLOAT_INFINITY.match(text):
    return float('-inf') if text[0] == '-' else float('inf')
  if _FLOAT_NAN.match(text):
    return float('nan')
  # Assume the C-style '1.0f' suffix form.
  try:
    return float(text.rstrip('f'))
  except ValueError:
    raise ValueError('Couldn\'t parse float: %s' % text)
1057
1058
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  # Accept the canonical lowercase text-format spellings plus the
  # capitalized Python-style literals 'True'/'False', matching the behavior
  # of later upstream protobuf releases.
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')
1077
1078
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    number = None

  if number is None:
    # Symbolic name lookup.
    enum_value = enum_descriptor.values_by_name.get(value)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value named %s.' % (
              enum_descriptor.full_name, value))
  else:
    # Numeric value lookup.
    enum_value = enum_descriptor.values_by_number.get(number)
    if enum_value is None:
      raise ValueError(
          'Enum type "%s" has no value with number %d.' % (
              enum_descriptor.full_name, number))
  return enum_value.number