blob: ab928f4414125a78e63b11a4d0a47cd43a3e02f2 [file] [log] [blame]
#ifndef AOS_JSON_TOKENIZER_H_
#define AOS_JSON_TOKENIZER_H_

#include <string>
#include <string_view>
#include <vector>

#include "absl/strings/numbers.h"
#include "flatbuffers/util.h"

namespace aos {

13// This class implements the state machine at json.org
14//
15// The only modification is that it supports C++ comments /**/ in all
16// whitespace.
17class Tokenizer {
18 public:
James Kuszmaul3ae42262019-11-08 12:33:41 -080019 Tokenizer(const std::string_view data) : data_(data) {}
Austin Schuhd7e252d2019-10-06 13:51:02 -070020
21 enum class TokenType {
22 kEnd,
23 kError,
24 kStartObject,
25 kEndObject,
26 kStartArray,
27 kEndArray,
28 kField,
29 kNumberValue,
30 kStringValue,
31 kTrueValue,
32 kFalseValue,
33 };
34
35 // Returns the next token.
36 TokenType Next();
37
38 // Returns the last field_name and field_value. These are only valid when
39 // Next returns them.
40 const ::std::string &field_name() const { return field_name_; }
41 const ::std::string &field_value() const { return field_value_; }
42
43 // Parses the current field value as a long long. Returns false if it failed
44 // to parse.
James Kuszmaul768c4682023-10-12 21:07:16 -070045 bool FieldAsInt(absl::int128 *value);
Austin Schuhd7e252d2019-10-06 13:51:02 -070046 // Parses the current field value as a double. Returns false if it failed
47 // to parse.
48 bool FieldAsDouble(double *value);
49
50 // Returns true if we are at the end of the input.
51 bool AtEnd() { return data_.size() == 0; }
52
James Kuszmaul3ae42262019-11-08 12:33:41 -080053 const std::string_view data_left() const { return data_; }
Austin Schuhd7e252d2019-10-06 13:51:02 -070054
55 private:
56 // Consumes a single character.
57 void ConsumeChar() { data_ = data_.substr(1); }
58
59 // Returns the current character.
60 char Char() const { return data_[0]; }
61
62 // Consumes a string out of data_. Populates s with the string. Returns true
63 // if a valid string was found, and false otherwise.
64 // data_ is updated only on success.
65 bool ConsumeString(::std::string *s);
66 // Consumes a number out of data_. Populates s with the string containing the
67 // number. Returns true if a valid number was found, and false otherwise.
68 // data_ is updated only on success.
69 bool ConsumeNumber(::std::string *s);
70 // Consumes a fixed token out of data_. Returns true if the string was found,
71 // and false otherwise.
72 // data_ is updated only on success.
73 bool Consume(const char *token);
74 // Consumes whitespace out of data_. Returns true if the string was found,
75 // and false otherwise.
76 // data_ is unconditionally updated.
77 void ConsumeWhitespace();
Pallavi Madhukare2eb2812022-07-19 09:56:09 -070078 // Consumes a unicode out of data_. Populates s with the unicode. Returns
79 // true if a valid unicode was found, and false otherwise. data_ is updated
80 // only on success.
81 bool ConsumeUnicode(::std::string *s);
Austin Schuhd7e252d2019-10-06 13:51:02 -070082
83 // State for the parsing state machine.
84 enum class State {
85 kExpectField,
86 kExpectObjectStart,
87 kExpectObjectEnd,
88 kExpectArrayEnd,
89 kExpectValue,
90 kExpectEnd,
91 };
92
93 State state_ = State::kExpectObjectStart;
94
95 // Data pointer.
James Kuszmaul3ae42262019-11-08 12:33:41 -080096 std::string_view data_;
Austin Schuhd7e252d2019-10-06 13:51:02 -070097 // Current line number used for printing debug.
98 int linenumber_ = 0;
99
Pallavi Madhukare2eb2812022-07-19 09:56:09 -0700100 // Surrogate pairs i.e. high surrogates (\ud000 - \ud8ff) combined
101 // with low surrogates (\udc00 - \udfff) cannot be interpreted when
102 // they do not appear as a part of the pair.
103 int unicode_high_surrogate_ = -1;
104
Austin Schuhd7e252d2019-10-06 13:51:02 -0700105 // Stack used to track which object type we were in when we recursed.
106 enum class ObjectType {
107 kObject,
108 kArray,
109 };
110 ::std::vector<ObjectType> object_type_;
111
112 // Last field name.
113 ::std::string field_name_;
114 // Last field value.
115 ::std::string field_value_;
116};

}  // namespace aos

#endif  // AOS_JSON_TOKENIZER_H_