Add unicode parsing for JSON strings
Parse unicode escapes in JSON strings. Escapes must use the \uXXXX syntax
(exactly four hex digits). Surrogate pairs are combined into a single code
point, and malformed escapes or unpaired surrogates are rejected.
Change-Id: Ie6aa16ef1a67110c02f0374fe04edd9004a12c9b
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/BUILD b/aos/BUILD
index 1380f7d..b7aef88 100644
--- a/aos/BUILD
+++ b/aos/BUILD
@@ -341,6 +341,7 @@
hdrs = ["json_tokenizer.h"],
target_compatible_with = ["@platforms//os:linux"],
deps = [
+ "@com_github_google_flatbuffers//:flatbuffers",
"@com_github_google_glog//:glog",
"@com_google_absl//absl/strings",
],
diff --git a/aos/json_to_flatbuffer_test.cc b/aos/json_to_flatbuffer_test.cc
index 42457f6..4918281 100644
--- a/aos/json_to_flatbuffer_test.cc
+++ b/aos/json_to_flatbuffer_test.cc
@@ -97,6 +97,18 @@
EXPECT_TRUE(JsonAndBack("{ \"foo_double\": -nan }"));
}
+// Tests that valid unicode escapes parse and malformed ones are rejected.
+TEST_F(JsonToFlatbufferTest, Unicode) {
+  EXPECT_TRUE(JsonAndBack(R"({ "foo_string": "\uF672" })"));
+  EXPECT_TRUE(JsonAndBack(R"({ "foo_string": "\uEFEF" })"));
+  EXPECT_TRUE(JsonAndBack(R"({ "foo_string": "helloworld\uD83E\uDE94" })"));
+  EXPECT_TRUE(JsonAndBack(R"({ "foo_string": "\uD83C\uDF32" })"));
+  EXPECT_FALSE(JsonAndBack(R"({ "foo_string": "\uP890" })"));
+  EXPECT_FALSE(JsonAndBack(R"({ "foo_string": "\u!FA8" })"));
+  EXPECT_FALSE(JsonAndBack(R"({ "foo_string": "\uF89" })"));
+  EXPECT_FALSE(JsonAndBack(R"({ "foo_string": "\uD83C" })"));
+}
+
// Tests that we can handle decimal points.
TEST_F(JsonToFlatbufferTest, DecimalPoint) {
EXPECT_TRUE(JsonAndBack("{ \"foo_float\": 5.1 }"));
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
index 0e235e6..b3c6620 100644
--- a/aos/json_tokenizer.cc
+++ b/aos/json_tokenizer.cc
@@ -91,6 +91,11 @@
// " is the end, declare victory.
if (Char() == '"') {
ConsumeChar();
+ if (unicode_high_surrogate_ != -1) {
+ fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+ data_ = original;
+ return false;
+ }
return true;
} else {
ConsumeChar();
@@ -116,11 +121,11 @@
} else if (Char() == 't') {
*s += "\t";
} else if (Char() == 'u') {
- // TODO(austin): Unicode should be valid, but I really don't care to
- // do this now...
- fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
- data_ = original;
- return false;
+ if (!ConsumeUnicode(s)) {
+ fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
+ data_ = original;
+ return false;
+ }
}
}
// And skip the escaped character.
@@ -131,6 +136,68 @@
}
}
+bool Tokenizer::ConsumeUnicode(::std::string *s) {
+  // Decodes the 4 hex digits of a unicode escape and appends the result to *s
+  // as UTF-8. Called with Char() on the 'u'; on success Char() is left on the
+  // last hex digit so that the caller can skip it. Surrogate halves are
+  // combined across two calls through unicode_high_surrogate_. Under no
+  // conditions is it acceptable to run out of data while parsing a unicode.
+
+  // Consume the 'u'.
+  ConsumeChar();
+
+  char target[5];
+
+  // Valid unicode is 4 hex digits so evaluate the next 4 characters.
+  for (int count = 0; count < 4; count++) {
+    // Running out of data or hitting a non-hex character is an error. The
+    // unsigned char cast matters: isxdigit() is undefined for negative
+    // values, which plain char can take on for non-ASCII input bytes.
+    if (AtEnd() || !isxdigit(static_cast<unsigned char>(Char()))) {
+      // Drop any pending surrogate so a failed parse leaves no stale state.
+      unicode_high_surrogate_ = -1;
+      return false;
+    }
+    target[count] = Char();
+    // Do not consume the last character; the caller does that.
+    if (count != 3) {
+      ConsumeChar();
+    }
+  }
+  target[4] = '\0';
+
+  // References: flatbuffers/src/idl_parser.cpp
+  const uint32_t val = flatbuffers::StringToUInt(target, 16);
+
+  if (val >= 0xD800 && val <= 0xDBFF) {
+    // High surrogate: remember it until the matching low surrogate arrives.
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
+      unicode_high_surrogate_ = -1;
+      return false;
+    }
+    unicode_high_surrogate_ = static_cast<int>(val);
+  } else if (val >= 0xDC00 && val <= 0xDFFF) {
+    // Low surrogate: only valid as the second half of a pair.
+    if (unicode_high_surrogate_ == -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
+      return false;
+    }
+    const int code_point =
+        0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
+    flatbuffers::ToUTF8(code_point, s);
+    unicode_high_surrogate_ = -1;
+  } else {
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+      unicode_high_surrogate_ = -1;
+      return false;
+    }
+    flatbuffers::ToUTF8(static_cast<int>(val), s);
+  }
+  return true;
+}
+
bool Tokenizer::ConsumeNumber(::std::string *s) {
// Under no conditions is it acceptible to run out of data while parsing a
// number. Any AtEnd() checks should confirm that.
diff --git a/aos/json_tokenizer.h b/aos/json_tokenizer.h
index 5b41da1..dec2a32 100644
--- a/aos/json_tokenizer.h
+++ b/aos/json_tokenizer.h
@@ -4,6 +4,7 @@
#include <string>
#include <string_view>
#include <vector>
+#include "flatbuffers/util.h"
namespace aos {
@@ -72,6 +73,10 @@
// and false otherwise.
// data_ is unconditionally updated.
void ConsumeWhitespace();
+ // Consumes a unicode escape out of data_ and appends it to s as UTF-8.
+ // Returns true if a valid unicode was found, and false otherwise. On failure
+ // data_ may be partially consumed; the caller restores it.
+ bool ConsumeUnicode(::std::string *s);
// State for the parsing state machine.
enum class State {
@@ -90,6 +95,11 @@
// Current line number used for printing debug.
int linenumber_ = 0;
+ // Pending high surrogate (\ud800 - \udbff) waiting to be combined with a
+ // low surrogate (\udc00 - \udfff), or -1 if there is none. Neither half of
+ // a surrogate pair can be interpreted on its own.
+ int unicode_high_surrogate_ = -1;
+
// Stack used to track which object type we were in when we recursed.
enum class ObjectType {
kObject,