Add unicode parsing for JSON strings

Parses unicode escape sequences (\uxxxx) in JSON strings and validates
them, including surrogate pairs.
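
For reference, the surrogate-pair arithmetic used here is the standard
UTF-16 decoding.  A minimal standalone sketch (not part of the change
itself), using the \uD83C\uDF32 pair from the new test case:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // High and low surrogate halves of "\uD83C\uDF32".
    const uint32_t high = 0xD83C;
    const uint32_t low = 0xDF32;
    // Combine: 0x10000 plus the low 10 bits of each half.
    const uint32_t code_point =
        0x10000 + ((high & 0x03FF) << 10) + (low & 0x03FF);
    printf("U+%X\n", code_point);  // Prints U+1F332.
    return 0;
  }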

Change-Id: Ie6aa16ef1a67110c02f0374fe04edd9004a12c9b
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/BUILD b/aos/BUILD
index 1380f7d..b7aef88 100644
--- a/aos/BUILD
+++ b/aos/BUILD
@@ -341,6 +341,7 @@
     hdrs = ["json_tokenizer.h"],
     target_compatible_with = ["@platforms//os:linux"],
     deps = [
+        "@com_github_google_flatbuffers//:flatbuffers",
         "@com_github_google_glog//:glog",
         "@com_google_absl//absl/strings",
     ],
diff --git a/aos/json_to_flatbuffer_test.cc b/aos/json_to_flatbuffer_test.cc
index 42457f6..4918281 100644
--- a/aos/json_to_flatbuffer_test.cc
+++ b/aos/json_to_flatbuffer_test.cc
@@ -97,6 +97,18 @@
   EXPECT_TRUE(JsonAndBack("{ \"foo_double\": -nan }"));
 }
 
+// Tests that unicode escape sequences are handled correctly.
+TEST_F(JsonToFlatbufferTest, Unicode) {
+  EXPECT_TRUE(JsonAndBack("{ \"foo_string\": \"\\uF672\" }"));
+  EXPECT_TRUE(JsonAndBack("{ \"foo_string\": \"\\uEFEF\" }"));
+  EXPECT_TRUE(JsonAndBack("{ \"foo_string\": \"helloworld\\uD83E\\uDE94\" }"));
+  EXPECT_TRUE(JsonAndBack("{ \"foo_string\": \"\\uD83C\\uDF32\" }"));
+  EXPECT_FALSE(JsonAndBack("{ \"foo_string\": \"\\uP890\" }"));
+  EXPECT_FALSE(JsonAndBack("{ \"foo_string\": \"\\u!FA8\" }"));
+  EXPECT_FALSE(JsonAndBack("{ \"foo_string\": \"\\uF89\" }"));
+  EXPECT_FALSE(JsonAndBack("{ \"foo_string\": \"\\uD83C\" }"));
+}
+
 // Tests that we can handle decimal points.
 TEST_F(JsonToFlatbufferTest, DecimalPoint) {
   EXPECT_TRUE(JsonAndBack("{ \"foo_float\": 5.1 }"));
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
index 0e235e6..b3c6620 100644
--- a/aos/json_tokenizer.cc
+++ b/aos/json_tokenizer.cc
@@ -91,6 +91,11 @@
       // " is the end, declare victory.
       if (Char() == '"') {
         ConsumeChar();
+        if (unicode_high_surrogate_ != -1) {
+          fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+          data_ = original;
+          return false;
+        }
         return true;
       } else {
         ConsumeChar();
@@ -116,11 +121,11 @@
         } else if (Char() == 't') {
           *s += "\t";
         } else if (Char() == 'u') {
-          // TODO(austin): Unicode should be valid, but I really don't care to
-          // do this now...
-          fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
-          data_ = original;
-          return false;
+          if (!ConsumeUnicode(s)) {
+            fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
+            data_ = original;
+            return false;
+          }
         }
       }
       // And skip the escaped character.
@@ -131,6 +136,68 @@
   }
 }
 
+bool Tokenizer::ConsumeUnicode(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // unicode escape.  Any AtEnd() checks should confirm that.
+  uint32_t val;
+
+  // Consume the 'u' that introduces the escape sequence.
+  ConsumeChar();
+
+  char target[5];
+
+  // A unicode escape is exactly 4 hex digits, so read the next 4 characters.
+  for (int count = 0; count < 4; count++) {
+    // Bail out if we run out of data or hit a non-hex character.
+    if (AtEnd()) {
+      return false;
+    }
+
+    if (!isxdigit(Char())) {
+      return false;
+    }
+
+    target[count] = Char();
+
+    // Leave the last hex digit for the caller's trailing ConsumeChar().
+    if (count == 3) {
+      break;
+    }
+
+    ConsumeChar();
+  }
+  target[4] = '\0';
+
+  // Reference: flatbuffers/src/idl_parser.cpp.
+  val = flatbuffers::StringToUInt(target, 16);
+
+  if (val >= 0xD800 && val <= 0xDBFF) {  // High surrogate.
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
+      return false;
+    } else {
+      unicode_high_surrogate_ = static_cast<int>(val);
+    }
+  } else if (val >= 0xDC00 && val <= 0xDFFF) {  // Low surrogate.
+    if (unicode_high_surrogate_ == -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
+      return false;
+    } else {
+      int code_point =
+          0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
+      flatbuffers::ToUTF8(code_point, s);
+      unicode_high_surrogate_ = -1;
+    }
+  } else {
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+      return false;
+    }
+    flatbuffers::ToUTF8(static_cast<int>(val), s);
+  }
+  return true;
+}
+
 bool Tokenizer::ConsumeNumber(::std::string *s) {
   // Under no conditions is it acceptible to run out of data while parsing a
   // number.  Any AtEnd() checks should confirm that.
diff --git a/aos/json_tokenizer.h b/aos/json_tokenizer.h
index 5b41da1..dec2a32 100644
--- a/aos/json_tokenizer.h
+++ b/aos/json_tokenizer.h
@@ -4,6 +4,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include "flatbuffers/util.h"
 
 namespace aos {
 
@@ -72,6 +73,10 @@
   // and false otherwise.
   // data_ is unconditionally updated.
   void ConsumeWhitespace();
+  // Consumes a \uXXXX escape out of data_, appending the decoded UTF-8 to *s.
+  // Returns true if a valid escape was found, and false otherwise.  data_ is
+  // consumed even on failure; the caller restores it.
+  bool ConsumeUnicode(::std::string *s);
 
   // State for the parsing state machine.
   enum class State {
@@ -90,6 +95,11 @@
   // Current line number used for printing debug.
   int linenumber_ = 0;
 
+  // Surrogate pairs: a high surrogate (\uD800 - \uDBFF) must be paired with a
+  // low surrogate (\uDC00 - \uDFFF); neither half can be interpreted on its
+  // own.  Holds the pending high surrogate, or -1 if there is none.
+  int unicode_high_surrogate_ = -1;
+
   // Stack used to track which object type we were in when we recursed.
   enum class ObjectType {
     kObject,