Add unicode parsing for JSON strings

Parses unicode escape sequences in JSON strings. The syntax of an escape in
the string is \uXXXX (four hex digits). Also validates the escape, including
surrogate pairs.
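
For illustration, a few inputs and how the new parsing treats them (assuming
standard JSON escape semantics):

    "\u00e9"        -> U+00E9 (single BMP code point)
    "\ud83d\ude00"  -> U+1F600 (high/low surrogate pair combined)
    "\ud800" alone  -> rejected (unpaired high surrogate)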

Change-Id: Ie6aa16ef1a67110c02f0374fe04edd9004a12c9b
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/json_tokenizer.cc b/aos/json_tokenizer.cc
index 0e235e6..b3c6620 100644
--- a/aos/json_tokenizer.cc
+++ b/aos/json_tokenizer.cc
@@ -91,6 +91,11 @@
       // " is the end, declare victory.
       if (Char() == '"') {
         ConsumeChar();
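+        // Reject the string if it ends while a high surrogate is still
+        // waiting for its matching low surrogate.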
+        if (unicode_high_surrogate_ != -1) {
+          fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+          data_ = original;
+          return false;
+        }
         return true;
       } else {
         ConsumeChar();
@@ -116,11 +121,11 @@
         } else if (Char() == 't') {
           *s += "\t";
         } else if (Char() == 'u') {
-          // TODO(austin): Unicode should be valid, but I really don't care to
-          // do this now...
-          fprintf(stderr, "Unexpected unicode on line %d\n", linenumber_);
-          data_ = original;
-          return false;
+          if (!ConsumeUnicode(s)) {
+            fprintf(stderr, "Invalid unicode on line %d\n", linenumber_);
+            data_ = original;
+            return false;
+          }
         }
       }
       // And skip the escaped character.
@@ -131,6 +136,68 @@
   }
 }
 
+bool Tokenizer::ConsumeUnicode(::std::string *s) {
+  // Under no conditions is it acceptable to run out of data while parsing a
+  // unicode escape.  Any AtEnd() checks should confirm that.
+  uint32_t val;
+
+  // Consume the 'u' that introduces the escape sequence.
+  ConsumeChar();
+
+  char target[5];
+
+  // A valid escape is exactly 4 hex digits, so evaluate the next 4 characters.
+  for (int count = 0; count < 4; count++) {
+    // If there is no data or data is an invalid char, return false
+    if (AtEnd()) {
+      return false;
+    }
+
+    if (!isxdigit(Char())) {
+      return false;
+    }
+
+    target[count] = Char();
+
+    // Do not consume the last hex digit; the caller skips it as the
+    // escaped character.
+    if (count == 3) {
+      break;
+    }
+
+    ConsumeChar();
+  }
+  target[4] = '\0';
+
+  // References: flatbuffers/src/idl_parser.cpp
+  val = flatbuffers::StringToUInt(target, 16);
+
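+  // Code points in [0xD800, 0xDBFF] are high surrogates: remember the value
+  // and pair it with the following low surrogate in [0xDC00, 0xDFFF].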
+  if (val >= 0xD800 && val <= 0xDBFF) {
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Multiple high surrogates\n");
+      return false;
+    } else {
+      unicode_high_surrogate_ = static_cast<int>(val);
+    }
+  } else if (val >= 0xDC00 && val <= 0xDFFF) {
+    if (unicode_high_surrogate_ == -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired low surrogate\n");
+      return false;
+    } else {
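+      // Combine the surrogate pair: the high surrogate supplies the top 10
+      // bits of the code point and the low surrogate the bottom 10, offset
+      // from 0x10000.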
+      int code_point =
+          0x10000 + ((unicode_high_surrogate_ & 0x03FF) << 10) + (val & 0x03FF);
+      flatbuffers::ToUTF8(code_point, s);
+      unicode_high_surrogate_ = -1;
+    }
+  } else {
+    if (unicode_high_surrogate_ != -1) {
+      fprintf(stderr, "Invalid unicode - Unpaired high surrogate\n");
+      return false;
+    }
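+    // Ordinary BMP code point; encode it directly as UTF-8.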
+    flatbuffers::ToUTF8(static_cast<int>(val), s);
+  }
+  return true;
+}
+
 bool Tokenizer::ConsumeNumber(::std::string *s) {
   // Under no conditions is it acceptable to run out of data while parsing a
   // number.  Any AtEnd() checks should confirm that.