Brian Silverman | 9c614bc | 2016-02-15 20:20:02 -0500 | [diff] [blame] | 1 | #region Copyright notice and license |
| 2 | // Protocol Buffers - Google's data interchange format |
| 3 | // Copyright 2008 Google Inc. All rights reserved. |
| 4 | // https://developers.google.com/protocol-buffers/ |
| 5 | // |
| 6 | // Redistribution and use in source and binary forms, with or without |
| 7 | // modification, are permitted provided that the following conditions are |
| 8 | // met: |
| 9 | // |
| 10 | // * Redistributions of source code must retain the above copyright |
| 11 | // notice, this list of conditions and the following disclaimer. |
| 12 | // * Redistributions in binary form must reproduce the above |
| 13 | // copyright notice, this list of conditions and the following disclaimer |
| 14 | // in the documentation and/or other materials provided with the |
| 15 | // distribution. |
| 16 | // * Neither the name of Google Inc. nor the names of its |
| 17 | // contributors may be used to endorse or promote products derived from |
| 18 | // this software without specific prior written permission. |
| 19 | // |
| 20 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 23 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 24 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 25 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 26 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 27 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 28 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 29 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 30 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 31 | #endregion |
| 32 | using System; |
| 33 | using System.Collections.Generic; |
| 34 | using System.Globalization; |
| 35 | using System.IO; |
| 36 | using System.Text; |
| 37 | |
| 38 | namespace Google.Protobuf |
| 39 | { |
| 40 | /// <summary> |
| 41 | /// Simple but strict JSON tokenizer, rigidly following RFC 7159. |
| 42 | /// </summary> |
| 43 | /// <remarks> |
| 44 | /// <para> |
| 45 | /// This tokenizer is stateful, and only returns "useful" tokens - names, values etc. |
| 46 | /// It does not create tokens for the separator between names and values, or for the comma |
| 47 | /// between values. It validates the token stream as it goes - so callers can assume that the |
| 48 | /// tokens it produces are appropriate. For example, it would never produce "start object, end array." |
| 49 | /// </para> |
/// <para>Implementation details: the base class handles single token push-back and object depth tracking; concrete tokenizers only supply tokens through <see cref="NextImpl"/>.</para>
| 51 | /// <para>Not thread-safe.</para> |
| 52 | /// </remarks> |
| 53 | internal abstract class JsonTokenizer |
| 54 | { |
| 55 | private JsonToken bufferedToken; |
| 56 | |
/// <summary>
/// Builds a tokenizer which parses JSON text pulled from <paramref name="reader"/>.
/// </summary>
/// <param name="reader">The source of the JSON text.</param>
/// <returns>A new text-based tokenizer wrapping <paramref name="reader"/>.</returns>
internal static JsonTokenizer FromTextReader(TextReader reader)
{
    JsonTokenizer textTokenizer = new JsonTextTokenizer(reader);
    return textTokenizer;
}
| 64 | |
/// <summary>
/// Builds a tokenizer which hands out <paramref name="tokens"/> in order and, once they run out,
/// takes every further token from <paramref name="continuation"/>.
/// </summary>
/// <remarks>
/// Pushing a token back on the returned tokenizer does not push it back on the continuation
/// tokenizer, and vice versa. This exists for the sake of Any parsing; use it with care elsewhere.
/// </remarks>
/// <param name="tokens">The tokens to replay first.</param>
/// <param name="continuation">The tokenizer consulted after the replayed tokens are exhausted.</param>
/// <returns>A new replaying tokenizer.</returns>
internal static JsonTokenizer FromReplayedTokens(IList<JsonToken> tokens, JsonTokenizer continuation)
{
    JsonTokenizer replayTokenizer = new JsonReplayTokenizer(tokens, continuation);
    return replayTokenizer;
}
| 75 | |
/// <summary>
/// The number of objects (arrays are not counted) that are currently open: informally, how many
/// '{' characters have been consumed without a matching '}'.
/// </summary>
/// <remarks>
/// The count goes up for every StartObject token handed out by <see cref="Next"/> and down for
/// every EndObject token; <see cref="PushBack"/> reverses the adjustment for the token pushed back.
/// </remarks>
internal int ObjectDepth { get; private set; }
| 81 | |
/// <summary>
/// Buffers <paramref name="token"/> so that the following call to <see cref="Next"/> returns it
/// instead of asking <see cref="NextImpl"/> for a new token. Only one token can be buffered at a time.
/// </summary>
/// <param name="token">The token to hand out again.</param>
/// <exception cref="InvalidOperationException">A token is already buffered.</exception>
// TODO: Why do we allow a different token to be pushed back? It might be better to always remember the previous
// token returned, and allow a parameterless Rewind() method (which could only be called once, just like the current PushBack).
internal void PushBack(JsonToken token)
{
    if (bufferedToken != null)
    {
        throw new InvalidOperationException("Can't push back twice");
    }
    bufferedToken = token;

    // Undo the depth adjustment Next() applies for object boundaries, so that
    // handing the token out a second time leaves ObjectDepth balanced.
    switch (token.Type)
    {
        case JsonToken.TokenType.StartObject:
            ObjectDepth--;
            break;
        case JsonToken.TokenType.EndObject:
            ObjectDepth++;
            break;
    }
}
| 100 | |
/// <summary>
/// Produces the next JSON token. The end of the stream is signalled by an EndDocument token;
/// <c>Next()</c> should not be called again once that token has been returned.
/// </summary>
/// <remarks>
/// A token previously handed to <see cref="PushBack"/> is returned (and the buffer cleared) in preference
/// to calling <see cref="NextImpl"/>. <see cref="ObjectDepth"/> is adjusted for object start/end tokens.
/// </remarks>
/// <returns>The next token in the stream. This is never null.</returns>
/// <exception cref="InvalidOperationException">This method is called after an EndDocument token has been returned</exception>
/// <exception cref="InvalidJsonException">The input text does not comply with RFC 7159</exception>
internal JsonToken Next()
{
    JsonToken result = bufferedToken;
    if (result != null)
    {
        // Consume the pushed-back token.
        bufferedToken = null;
    }
    else
    {
        result = NextImpl();
    }

    switch (result.Type)
    {
        case JsonToken.TokenType.StartObject:
            ObjectDepth++;
            break;
        case JsonToken.TokenType.EndObject:
            ObjectDepth--;
            break;
    }
    return result;
}
| 131 | |
/// <summary>
/// Supplies the next JSON token on behalf of the base class: <see cref="Next"/> calls this
/// whenever no pushed-back token is waiting to be returned.
/// </summary>
/// <returns>The next token produced by the concrete tokenizer.</returns>
/// <exception cref="InvalidOperationException">This method is called after an EndDocument token has been returned</exception>
/// <exception cref="InvalidJsonException">The input text does not comply with RFC 7159</exception>
protected abstract JsonToken NextImpl();
| 139 | |
/// <summary>
/// Discards the value that is about to be read. Call this only immediately after reading a property name.
/// When the value is an object or an array, everything up to and including its matching end token is discarded.
/// </summary>
internal void SkipValue()
{
    // Next() is relied upon to validate that end-object/end-array tokens pair up correctly,
    // so a single counter of currently open containers is all that is needed here.
    int openContainers = 0;

    // At least one token is always consumed; the loop ends once every container
    // opened while skipping has been closed again.
    while (true)
    {
        JsonToken.TokenType type = Next().Type;
        if (type == JsonToken.TokenType.StartArray || type == JsonToken.TokenType.StartObject)
        {
            openContainers++;
        }
        else if (type == JsonToken.TokenType.EndArray || type == JsonToken.TokenType.EndObject)
        {
            openContainers--;
        }

        if (openContainers == 0)
        {
            return;
        }
    }
}
| 167 | |
/// <summary>
/// Tokenizer that replays a fixed list of tokens and, once the list is used up,
/// defers to a second tokenizer for everything that follows.
/// </summary>
private class JsonReplayTokenizer : JsonTokenizer
{
    private readonly IList<JsonToken> tokens;
    private readonly JsonTokenizer nextTokenizer;

    // Position within "tokens" of the next token to replay.
    private int nextTokenIndex;

    /// <summary>
    /// Creates a tokenizer replaying <paramref name="tokens"/> before reading from <paramref name="nextTokenizer"/>.
    /// </summary>
    internal JsonReplayTokenizer(IList<JsonToken> tokens, JsonTokenizer nextTokenizer)
    {
        this.nextTokenizer = nextTokenizer;
        this.tokens = tokens;
    }

    // FIXME: Object depth not maintained...
    /// <summary>
    /// Returns the next replayed token while any remain, otherwise the next token of the continuation tokenizer.
    /// </summary>
    protected override JsonToken NextImpl()
    {
        if (nextTokenIndex < tokens.Count)
        {
            int replayIndex = nextTokenIndex++;
            return tokens[replayIndex];
        }
        return nextTokenizer.Next();
    }
}
| 193 | |
| 194 | /// <summary> |
| 195 | /// Tokenizer which does all the *real* work of parsing JSON. |
| 196 | /// </summary> |
| 197 | private sealed class JsonTextTokenizer : JsonTokenizer |
| 198 | { |
| 199 | // The set of states in which a value is valid next token. |
| 200 | private static readonly State ValueStates = State.ArrayStart | State.ArrayAfterComma | State.ObjectAfterColon | State.StartOfDocument; |
| 201 | |
| 202 | private readonly Stack<ContainerType> containerStack = new Stack<ContainerType>(); |
| 203 | private readonly PushBackReader reader; |
| 204 | private State state; |
| 205 | |
/// <summary>
/// Creates a tokenizer positioned at the start of a document read from <paramref name="reader"/>.
/// </summary>
internal JsonTextTokenizer(TextReader reader)
{
    // The document itself is the outermost container, so the stack is never empty
    // when PopContainer peeks at the parent.
    containerStack.Push(ContainerType.Document);
    state = State.StartOfDocument;
    this.reader = new PushBackReader(reader);
}
| 212 | |
/// <summary>
/// Reads characters until a complete token has been recognised, and returns that token.
/// </summary>
/// <remarks>
/// This method essentially just loops through characters skipping whitespace, validating and
/// changing state (e.g. from ObjectBeforeColon to ObjectAfterColon)
/// until it reaches something which will be a genuine token (e.g. a start object, or a value) at which point
/// it returns the token. Although the method is large, it would be relatively hard to break down further... most
/// of it is the large switch statement, which sometimes returns and sometimes doesn't.
/// </remarks>
/// <returns>The next token; EndDocument once the reader is exhausted in a valid state.</returns>
/// <exception cref="InvalidOperationException">Called again after EndDocument has been returned.</exception>
/// <exception cref="InvalidJsonException">The text is not valid JSON at the current position.</exception>
protected override JsonToken NextImpl()
{
    if (state == State.ReaderExhausted)
    {
        throw new InvalidOperationException("Next() called after end of document");
    }
    while (true)
    {
        var next = reader.Read();
        if (next == null)
        {
            // End of text is only legal once the single top-level value has been completed.
            ValidateState(State.ExpectedEndOfDocument, "Unexpected end of document in state: ");
            state = State.ReaderExhausted;
            return JsonToken.EndDocument;
        }
        switch (next.Value)
        {
            // Skip whitespace between tokens
            case ' ':
            case '\t':
            case '\r':
            case '\n':
                break;
            case ':':
                // Separator only: changes state but produces no token.
                ValidateState(State.ObjectBeforeColon, "Invalid state to read a colon: ");
                state = State.ObjectAfterColon;
                break;
            case ',':
                // Separator only: changes state but produces no token.
                ValidateState(State.ObjectAfterProperty | State.ArrayAfterValue, "Invalid state to read a comma: ");
                state = state == State.ObjectAfterProperty ? State.ObjectAfterComma : State.ArrayAfterComma;
                break;
            case '"':
                string stringValue = ReadString();
                // A string where a property is expected is a name; anywhere else it must be a value.
                if ((state & (State.ObjectStart | State.ObjectAfterComma)) != 0)
                {
                    state = State.ObjectBeforeColon;
                    return JsonToken.Name(stringValue);
                }
                else
                {
                    ValidateAndModifyStateForValue("Invalid state to read a double quote: ");
                    return JsonToken.Value(stringValue);
                }
            case '{':
                ValidateState(ValueStates, "Invalid state to read an open brace: ");
                state = State.ObjectStart;
                containerStack.Push(ContainerType.Object);
                return JsonToken.StartObject;
            case '}':
                // Not valid straight after a comma, hence ObjectAfterComma is absent here.
                ValidateState(State.ObjectAfterProperty | State.ObjectStart, "Invalid state to read a close brace: ");
                PopContainer();
                return JsonToken.EndObject;
            case '[':
                ValidateState(ValueStates, "Invalid state to read an open square bracket: ");
                state = State.ArrayStart;
                containerStack.Push(ContainerType.Array);
                return JsonToken.StartArray;
            case ']':
                // Not valid straight after a comma, hence ArrayAfterComma is absent here.
                ValidateState(State.ArrayAfterValue | State.ArrayStart, "Invalid state to read a close square bracket: ");
                PopContainer();
                return JsonToken.EndArray;
            case 'n': // Start of null
                ConsumeLiteral("null");
                ValidateAndModifyStateForValue("Invalid state to read a null literal: ");
                return JsonToken.Null;
            case 't': // Start of true
                ConsumeLiteral("true");
                ValidateAndModifyStateForValue("Invalid state to read a true literal: ");
                return JsonToken.True;
            case 'f': // Start of false
                ConsumeLiteral("false");
                ValidateAndModifyStateForValue("Invalid state to read a false literal: ");
                return JsonToken.False;
            case '-': // Start of a number
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                double number = ReadNumber(next.Value);
                ValidateAndModifyStateForValue("Invalid state to read a number token: ");
                return JsonToken.Value(number);
            default:
                // Created via the reader, like every other parse error in this class, so that any
                // location information CreateException gains in future is applied here as well.
                throw reader.CreateException("Invalid first character of token: " + next.Value);
        }
    }
}
| 312 | |
/// <summary>
/// Checks that the current state is one of <paramref name="validStates"/>, failing otherwise.
/// </summary>
/// <param name="validStates">Bitwise combination of the states which are acceptable.</param>
/// <param name="errorPrefix">Start of the error message; the current state is appended to it.</param>
/// <exception cref="InvalidJsonException">The current state is not among the valid ones.</exception>
private void ValidateState(State validStates, string errorPrefix)
{
    bool permitted = (state & validStates) != 0;
    if (permitted)
    {
        return;
    }
    throw reader.CreateException(errorPrefix + state);
}
| 320 | |
/// <summary>
/// Reads the remainder of a string token, up to and including the closing quote.
/// The opening " is assumed to have been consumed already.
/// </summary>
/// <returns>The string content with escape sequences resolved.</returns>
/// <exception cref="InvalidJsonException">
/// The text ends before the closing quote, contains a character below U+0020,
/// an invalid escape sequence, or surrogate code units that do not form pairs.
/// </exception>
private string ReadString()
{
    var text = new StringBuilder();
    // True when the previously appended code unit was a high surrogate still awaiting its partner.
    bool pendingHighSurrogate = false;
    for (;;)
    {
        char current = reader.ReadOrFail("Unexpected end of text while reading string");
        if (current < ' ')
        {
            throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in string literal: U+{0:x4}", (int) current));
        }

        switch (current)
        {
            case '"':
                // An unescaped quote terminates the string; a dangling high surrogate is an error.
                if (pendingHighSurrogate)
                {
                    throw reader.CreateException("Invalid use of surrogate pair code units");
                }
                return text.ToString();
            case '\\':
                current = ReadEscapedCharacter();
                break;
        }

        // TODO: Consider only allowing surrogate pairs that are either both escaped,
        // or both not escaped. It would be a very odd text stream that contained a "lone" high surrogate
        // followed by an escaped low surrogate or vice versa... and that couldn't even be represented in UTF-8.
        bool isLowSurrogate = char.IsLowSurrogate(current);
        if (isLowSurrogate != pendingHighSurrogate)
        {
            // Either a low surrogate without a preceding high one, or a high one not followed by a low one.
            throw reader.CreateException("Invalid use of surrogate pair code units");
        }
        pendingHighSurrogate = char.IsHighSurrogate(current);
        text.Append(current);
    }
}
| 358 | |
/// <summary>
/// Resolves a backslash escape sequence to the character it denotes.
/// The leading backslash is assumed to have been consumed already.
/// </summary>
/// <returns>The character represented by the escape sequence.</returns>
/// <exception cref="InvalidJsonException">The text ends, or the escape character is not recognised.</exception>
private char ReadEscapedCharacter()
{
    char escape = reader.ReadOrFail("Unexpected end of text while reading character escape sequence");
    switch (escape)
    {
        // Escapes which stand for themselves.
        case '"':
            return '"';
        case '\\':
            return '\\';
        case '/':
            return '/';
        // Control-character shorthands.
        case 'b':
            return '\b';
        case 'f':
            return '\f';
        case 'n':
            return '\n';
        case 'r':
            return '\r';
        case 't':
            return '\t';
        // Four hex digits follow.
        case 'u':
            return ReadUnicodeEscape();
        default:
            throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in character escape sequence: U+{0:x4}", (int) escape));
    }
}
| 389 | |
/// <summary>
/// Reads the four hexadecimal digits of a Unicode escape and combines them into one UTF-16 code unit.
/// The leading \u is assumed to have been consumed already.
/// </summary>
/// <returns>The code unit denoted by the four hex digits.</returns>
/// <exception cref="InvalidJsonException">The text ends early, or a character is not a hex digit.</exception>
private char ReadUnicodeEscape()
{
    int codeUnit = 0;
    for (int digitsRead = 0; digitsRead < 4; digitsRead++)
    {
        char hex = reader.ReadOrFail("Unexpected end of text while reading Unicode escape sequence");
        int digit;
        if ('0' <= hex && hex <= '9')
        {
            digit = hex - '0';
        }
        else if ('a' <= hex && hex <= 'f')
        {
            digit = 10 + (hex - 'a');
        }
        else if ('A' <= hex && hex <= 'F')
        {
            digit = 10 + (hex - 'A');
        }
        else
        {
            throw reader.CreateException(string.Format(CultureInfo.InvariantCulture, "Invalid character in character escape sequence: U+{0:x4}", (int) hex));
        }
        // Most significant digit comes first, so shift what we have up by one hex digit.
        codeUnit = (codeUnit * 16) + digit;
    }
    return (char) codeUnit;
}
| 420 | |
/// <summary>
/// Consumes the rest of a keyword literal, failing if the text read differs from <paramref name="text"/>.
/// The first letter of the literal is assumed to have been consumed already.
/// </summary>
/// <param name="text">The complete literal expected, including its already-read first letter.</param>
/// <exception cref="InvalidJsonException">The text ends early or deviates from the literal.</exception>
private void ConsumeLiteral(string text)
{
    // Position 0 was consumed by the caller, so matching starts at the second character.
    int position = 1;
    while (position < text.Length)
    {
        char? actual = reader.Read();
        if (!actual.HasValue)
        {
            throw reader.CreateException("Unexpected end of text while reading literal token " + text);
        }
        if (actual.Value != text[position])
        {
            throw reader.CreateException("Unexpected character while reading literal token " + text);
        }
        position++;
    }
}
| 440 | |
/// <summary>
/// Reads a complete JSON number (integer part, optional fraction, optional exponent) and
/// converts it to a double.
/// </summary>
/// <param name="initialCharacter">
/// The character which started the number and has already been consumed: '-' or a digit.
/// </param>
/// <returns>The parsed, finite value.</returns>
/// <exception cref="InvalidJsonException">
/// The text is not a well-formed number, or its magnitude is too large to be represented as a double.
/// </exception>
private double ReadNumber(char initialCharacter)
{
    StringBuilder builder = new StringBuilder();
    if (initialCharacter == '-')
    {
        builder.Append("-");
    }
    else
    {
        // A leading digit belongs to the integer part; hand it back so ReadInt can validate it.
        reader.PushBack(initialCharacter);
    }
    // Each method returns the character it read that doesn't belong in that part,
    // so we know what to do next, including pushing the character back at the end.
    // null is returned for "end of text".
    char? next = ReadInt(builder);
    if (next == '.')
    {
        next = ReadFrac(builder);
    }
    if (next == 'e' || next == 'E')
    {
        next = ReadExp(builder);
    }
    // If we read a character which wasn't part of the number, push it back so we can read it again
    // to parse the next token.
    if (next != null)
    {
        reader.PushBack(next.Value);
    }

    // TODO: What exception should we throw if the value can't be represented as a double?
    double result;
    try
    {
        result = double.Parse(builder.ToString(),
            NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent,
            CultureInfo.InvariantCulture);
    }
    catch (OverflowException)
    {
        // Runtimes prior to .NET Core 3.0 (including .NET Framework) report out-of-range input this way.
        throw reader.CreateException("Numeric value out of range: " + builder);
    }

    // .NET Core 3.0 and later return an infinity instead of throwing when the magnitude is too large.
    // JSON text can never spell an infinity, so an infinite result always means the value was out of range.
    if (double.IsInfinity(result))
    {
        throw reader.CreateException("Numeric value out of range: " + builder);
    }
    return result;
}
| 483 | |
/// <summary>
/// Reads the integer part of a number (one or more digits, with no superfluous leading zero)
/// and appends it to <paramref name="builder"/>.
/// </summary>
/// <param name="builder">Receives the digits read.</param>
/// <returns>The first character after the integer part, or null at the end of the text.</returns>
/// <exception cref="InvalidJsonException">
/// There is no leading digit, or a leading 0 is followed by further digits.
/// </exception>
private char? ReadInt(StringBuilder builder)
{
    char leading = reader.ReadOrFail("Invalid numeric literal");
    bool leadingIsDigit = leading >= '0' && leading <= '9';
    if (!leadingIsDigit)
    {
        throw reader.CreateException("Invalid numeric literal");
    }
    builder.Append(leading);

    int furtherDigits;
    char? terminator = ConsumeDigits(builder, out furtherDigits);
    if (furtherDigits != 0 && leading == '0')
    {
        throw reader.CreateException("Invalid numeric literal: leading 0 for non-zero value.");
    }
    return terminator;
}
| 500 | |
/// <summary>
/// Reads the fractional part of a number, appending "." and its digits to <paramref name="builder"/>.
/// The decimal point itself is assumed to have been consumed already.
/// </summary>
/// <param name="builder">Receives the decimal point and the digits read.</param>
/// <returns>The first character after the fraction, or null at the end of the text.</returns>
/// <exception cref="InvalidJsonException">No digit follows the decimal point.</exception>
private char? ReadFrac(StringBuilder builder)
{
    // The caller consumed the point without recording it.
    builder.Append('.');

    int fractionDigits;
    char? terminator = ConsumeDigits(builder, out fractionDigits);
    if (fractionDigits != 0)
    {
        return terminator;
    }
    throw reader.CreateException("Invalid numeric literal: fraction with no trailing digits");
}
| 512 | |
/// <summary>
/// Reads the exponent part of a number, appending "E", an optional sign and the digits
/// to <paramref name="builder"/>. The 'e' or 'E' is assumed to have been consumed already.
/// </summary>
/// <param name="builder">Receives the exponent marker, sign and digits.</param>
/// <returns>The first character after the exponent, or null at the end of the text.</returns>
/// <exception cref="InvalidJsonException">The text ends after the marker, or no digit follows it.</exception>
private char? ReadExp(StringBuilder builder)
{
    // The caller consumed the marker (in either case) without recording it.
    builder.Append('E');

    char? afterMarker = reader.Read();
    if (!afterMarker.HasValue)
    {
        throw reader.CreateException("Invalid numeric literal: exponent with no trailing digits");
    }

    char candidateSign = afterMarker.Value;
    if (candidateSign == '+' || candidateSign == '-')
    {
        builder.Append(candidateSign);
    }
    else
    {
        // Not a sign, so it should be the first exponent digit: let ConsumeDigits see it.
        reader.PushBack(candidateSign);
    }

    int exponentDigits;
    char? terminator = ConsumeDigits(builder, out exponentDigits);
    if (exponentDigits == 0)
    {
        throw reader.CreateException("Invalid numeric literal: exponent without value");
    }
    return terminator;
}
| 537 | |
/// <summary>
/// Appends consecutive decimal digits from the reader to <paramref name="builder"/>.
/// </summary>
/// <param name="builder">Receives the digits read.</param>
/// <param name="count">Set to the number of digits appended (possibly zero).</param>
/// <returns>
/// The first non-digit character read, which has been consumed from the reader and not pushed back,
/// or null at the end of the text.
/// </returns>
private char? ConsumeDigits(StringBuilder builder, out int count)
{
    count = 0;
    char? current = reader.Read();
    while (current.HasValue && current.Value >= '0' && current.Value <= '9')
    {
        count++;
        builder.Append(current.Value);
        current = reader.Read();
    }
    return current;
}
| 552 | |
/// <summary>
/// Checks that a value may appear in the current state, then moves to the state which follows
/// a completed value in the current context (for example ObjectAfterColon becomes ObjectAfterProperty).
/// </summary>
/// <param name="errorPrefix">Start of the error message used when a value is not valid here.</param>
/// <exception cref="InvalidJsonException">A value is not valid in the current state.</exception>
/// <exception cref="InvalidOperationException">
/// A state accepted as a value state has no transition defined here (an internal inconsistency).
/// </exception>
private void ValidateAndModifyStateForValue(string errorPrefix)
{
    ValidateState(ValueStates, errorPrefix);

    if (state == State.StartOfDocument)
    {
        // A top-level value: nothing but the end of the text may follow.
        state = State.ExpectedEndOfDocument;
    }
    else if (state == State.ObjectAfterColon)
    {
        state = State.ObjectAfterProperty;
    }
    else if (state == State.ArrayStart || state == State.ArrayAfterComma)
    {
        state = State.ArrayAfterValue;
    }
    else
    {
        throw new InvalidOperationException("ValidateAndModifyStateForValue does not handle all value states (and should)");
    }
}
| 576 | |
/// <summary>
/// Removes the innermost container from the stack and switches to the state which follows
/// a completed value inside the container that is now innermost.
/// </summary>
/// <exception cref="InvalidOperationException">The parent container type is not recognised.</exception>
private void PopContainer()
{
    containerStack.Pop();
    ContainerType enclosing = containerStack.Peek();

    if (enclosing == ContainerType.Object)
    {
        state = State.ObjectAfterProperty;
    }
    else if (enclosing == ContainerType.Array)
    {
        state = State.ArrayAfterValue;
    }
    else if (enclosing == ContainerType.Document)
    {
        state = State.ExpectedEndOfDocument;
    }
    else
    {
        throw new InvalidOperationException("Unexpected container type: " + enclosing);
    }
}
| 600 | |
/// <summary>
/// The kinds of context tracked on the container stack.
/// </summary>
private enum ContainerType
{
    // The document as a whole; pushed once by the constructor.
    Document,
    // Pushed when '{' is read.
    Object,
    // Pushed when '[' is read.
    Array
}
| 605 | |
/// <summary>
/// The states the tokenizer can be in.
/// </summary>
/// <remarks>
/// <para>The enum is declared with <c>[Flags]</c> only so that a set of acceptable states can be
/// expressed as one value and tested with a single bitwise AND; the tokenizer is only ever in one state.</para>
/// <para>
/// Each state is illustrated with an example in which ^ marks the current position in the text.
/// The examples use string values, but any value (including nested objects/arrays) could appear.
/// The full tokenizer state additionally includes the stack of enclosing containers.
/// "AfterValue" below is a notional state meaning a value has just been completed; it is immediately
/// resolved to ExpectedEndOfDocument, ObjectAfterProperty or ArrayAfterValue depending on the context.
/// </para>
/// <para>
/// These states were derived manually by reading RFC 7159 carefully.
/// </para>
/// </remarks>
[Flags]
private enum State
{
    /// <summary>
    /// ^ { "foo": "bar" }
    /// Before the single top-level value. Next states: ObjectStart, ArrayStart, "AfterValue"
    /// </summary>
    StartOfDocument = 0x001,
    /// <summary>
    /// { "foo": "bar" } ^
    /// After the top-level value; only the end of the text is valid. Next state: ReaderExhausted
    /// </summary>
    ExpectedEndOfDocument = 0x002,
    /// <summary>
    /// { "foo": "bar" } ^ (and the end of the reader has been reached)
    /// Terminal state.
    /// </summary>
    ReaderExhausted = 0x004,
    /// <summary>
    /// { ^ "foo": "bar" }
    /// Straight after the opening brace, before the *first* property.
    /// Next states:
    /// "AfterValue" (closing brace: empty object)
    /// ObjectBeforeColon (a name was read)
    /// </summary>
    ObjectStart = 0x008,
    /// <summary>
    /// { "foo" ^ : "bar", "x": "y" }
    /// A property name has been read. Next state: ObjectAfterColon
    /// </summary>
    ObjectBeforeColon = 0x010,
    /// <summary>
    /// { "foo" : ^ "bar", "x": "y" }
    /// The colon has been read, so the property's value is expected.
    /// Next states:
    /// "AfterValue" (value is simple)
    /// ObjectStart (value is object)
    /// ArrayStart (value is array)
    /// </summary>
    ObjectAfterColon = 0x020,
    /// <summary>
    /// { "foo" : "bar" ^ , "x" : "y" }
    /// A property is complete, so either a comma or the closing brace is expected.
    /// Next states: ObjectAfterComma or "AfterValue"
    /// </summary>
    ObjectAfterProperty = 0x040,
    /// <summary>
    /// { "foo":"bar", ^ "x":"y" }
    /// The comma after a property has been read, so another property must follow.
    /// Like ObjectStart, except that a closing brace is not valid here.
    /// Next state: ObjectBeforeColon.
    /// </summary>
    ObjectAfterComma = 0x080,
    /// <summary>
    /// [ ^ "foo", "bar" ]
    /// Straight after the opening bracket, before the *first* element.
    /// Next states:
    /// "AfterValue" (a simple value was read)
    /// "AfterValue" (closing bracket: empty array; pops the stack)
    /// ObjectStart, ArrayStart (element is an object or array)
    /// </summary>
    ArrayStart = 0x100,
    /// <summary>
    /// [ "foo" ^ , "bar" ]
    /// An element is complete, so either a comma or the closing bracket is expected.
    /// Next states: ArrayAfterComma or "AfterValue"
    /// </summary>
    ArrayAfterValue = 0x200,
    /// <summary>
    /// [ "foo", ^ "bar" ]
    /// The comma after an element has been read, so another value (simple or complex) *must* follow.
    /// Next states: "AfterValue" (simple value), ObjectStart, ArrayStart
    /// </summary>
    ArrayAfterComma = 0x400
}
| 699 | |
/// <summary>
/// Thin layer over a text reader which lets a single character be "un-read", and which
/// is the one place where parse exceptions are created.
/// </summary>
private class PushBackReader
{
    // TODO: Add locations for errors etc.

    private readonly TextReader reader;

    /// <summary>
    /// The character handed back via <see cref="PushBack"/> and not yet re-read, if any.
    /// </summary>
    private char? nextChar;

    /// <summary>
    /// Wraps <paramref name="reader"/>.
    /// </summary>
    internal PushBackReader(TextReader reader)
    {
        // TODO: Wrap the reader in a BufferedReader?
        this.reader = reader;
    }

    /// <summary>
    /// Reads one character: the pushed-back character if there is one (clearing it),
    /// otherwise the next character of the underlying reader.
    /// </summary>
    /// <returns>The character read, or null if the end of the text has been reached.</returns>
    internal char? Read()
    {
        if (nextChar.HasValue)
        {
            char buffered = nextChar.Value;
            nextChar = null;
            return buffered;
        }

        int code = reader.Read();
        if (code == -1)
        {
            // TextReader.Read reports the end of the text as -1.
            return null;
        }
        return (char) code;
    }

    /// <summary>
    /// Reads one character, treating the end of the text as an error.
    /// </summary>
    /// <param name="messageOnFailure">Message for the exception thrown at the end of the text.</param>
    /// <returns>The character read.</returns>
    /// <exception cref="InvalidJsonException">The end of the text has been reached.</exception>
    internal char ReadOrFail(string messageOnFailure)
    {
        char? candidate = Read();
        if (candidate.HasValue)
        {
            return candidate.Value;
        }
        throw CreateException(messageOnFailure);
    }

    /// <summary>
    /// Hands <paramref name="c"/> back so that the next <see cref="Read"/> returns it.
    /// </summary>
    /// <exception cref="InvalidOperationException">A pushed-back character is already waiting.</exception>
    internal void PushBack(char c)
    {
        if (nextChar.HasValue)
        {
            throw new InvalidOperationException("Cannot push back when already buffering a character");
        }
        nextChar = c;
    }

    /// <summary>
    /// Creates the exception used to report invalid JSON with the given message.
    /// </summary>
    internal InvalidJsonException CreateException(string message)
    {
        // TODO: Keep track of and use the location.
        return new InvalidJsonException(message);
    }
}
| 764 | } |
| 765 | } |
| 766 | } |