#ifndef AOS_JSON_TOKENIZER_H_
#define AOS_JSON_TOKENIZER_H_

#include <string>
#include <string_view>
#include <vector>

#include "absl/strings/numbers.h"
#include "flatbuffers/util.h"

namespace aos {

| 13 | // This class implements the state machine at json.org |
| 14 | // |
| 15 | // The only modification is that it supports C++ comments /**/ in all |
| 16 | // whitespace. |
| 17 | class Tokenizer { |
| 18 | public: |
James Kuszmaul | 3ae4226 | 2019-11-08 12:33:41 -0800 | [diff] [blame] | 19 | Tokenizer(const std::string_view data) : data_(data) {} |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 20 | |
| 21 | enum class TokenType { |
| 22 | kEnd, |
| 23 | kError, |
| 24 | kStartObject, |
| 25 | kEndObject, |
| 26 | kStartArray, |
| 27 | kEndArray, |
| 28 | kField, |
| 29 | kNumberValue, |
| 30 | kStringValue, |
| 31 | kTrueValue, |
| 32 | kFalseValue, |
| 33 | }; |
| 34 | |
| 35 | // Returns the next token. |
| 36 | TokenType Next(); |
| 37 | |
| 38 | // Returns the last field_name and field_value. These are only valid when |
| 39 | // Next returns them. |
| 40 | const ::std::string &field_name() const { return field_name_; } |
| 41 | const ::std::string &field_value() const { return field_value_; } |
| 42 | |
| 43 | // Parses the current field value as a long long. Returns false if it failed |
| 44 | // to parse. |
James Kuszmaul | 768c468 | 2023-10-12 21:07:16 -0700 | [diff] [blame] | 45 | bool FieldAsInt(absl::int128 *value); |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 46 | // Parses the current field value as a double. Returns false if it failed |
| 47 | // to parse. |
| 48 | bool FieldAsDouble(double *value); |
| 49 | |
| 50 | // Returns true if we are at the end of the input. |
| 51 | bool AtEnd() { return data_.size() == 0; } |
| 52 | |
James Kuszmaul | 3ae4226 | 2019-11-08 12:33:41 -0800 | [diff] [blame] | 53 | const std::string_view data_left() const { return data_; } |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 54 | |
| 55 | private: |
| 56 | // Consumes a single character. |
| 57 | void ConsumeChar() { data_ = data_.substr(1); } |
| 58 | |
| 59 | // Returns the current character. |
| 60 | char Char() const { return data_[0]; } |
| 61 | |
| 62 | // Consumes a string out of data_. Populates s with the string. Returns true |
| 63 | // if a valid string was found, and false otherwise. |
| 64 | // data_ is updated only on success. |
| 65 | bool ConsumeString(::std::string *s); |
| 66 | // Consumes a number out of data_. Populates s with the string containing the |
| 67 | // number. Returns true if a valid number was found, and false otherwise. |
| 68 | // data_ is updated only on success. |
| 69 | bool ConsumeNumber(::std::string *s); |
| 70 | // Consumes a fixed token out of data_. Returns true if the string was found, |
| 71 | // and false otherwise. |
| 72 | // data_ is updated only on success. |
| 73 | bool Consume(const char *token); |
| 74 | // Consumes whitespace out of data_. Returns true if the string was found, |
| 75 | // and false otherwise. |
| 76 | // data_ is unconditionally updated. |
| 77 | void ConsumeWhitespace(); |
Pallavi Madhukar | e2eb281 | 2022-07-19 09:56:09 -0700 | [diff] [blame] | 78 | // Consumes a unicode out of data_. Populates s with the unicode. Returns |
| 79 | // true if a valid unicode was found, and false otherwise. data_ is updated |
| 80 | // only on success. |
| 81 | bool ConsumeUnicode(::std::string *s); |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 82 | |
| 83 | // State for the parsing state machine. |
| 84 | enum class State { |
| 85 | kExpectField, |
| 86 | kExpectObjectStart, |
| 87 | kExpectObjectEnd, |
| 88 | kExpectArrayEnd, |
| 89 | kExpectValue, |
| 90 | kExpectEnd, |
| 91 | }; |
| 92 | |
| 93 | State state_ = State::kExpectObjectStart; |
| 94 | |
| 95 | // Data pointer. |
James Kuszmaul | 3ae4226 | 2019-11-08 12:33:41 -0800 | [diff] [blame] | 96 | std::string_view data_; |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 97 | // Current line number used for printing debug. |
| 98 | int linenumber_ = 0; |
| 99 | |
Pallavi Madhukar | e2eb281 | 2022-07-19 09:56:09 -0700 | [diff] [blame] | 100 | // Surrogate pairs i.e. high surrogates (\ud000 - \ud8ff) combined |
| 101 | // with low surrogates (\udc00 - \udfff) cannot be interpreted when |
| 102 | // they do not appear as a part of the pair. |
| 103 | int unicode_high_surrogate_ = -1; |
| 104 | |
Austin Schuh | d7e252d | 2019-10-06 13:51:02 -0700 | [diff] [blame] | 105 | // Stack used to track which object type we were in when we recursed. |
| 106 | enum class ObjectType { |
| 107 | kObject, |
| 108 | kArray, |
| 109 | }; |
| 110 | ::std::vector<ObjectType> object_type_; |
| 111 | |
| 112 | // Last field name. |
| 113 | ::std::string field_name_; |
| 114 | // Last field value. |
| 115 | ::std::string field_value_; |
| 116 | }; |

}  // namespace aos

#endif  // AOS_JSON_TOKENIZER_H_