/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.google.flatbuffers;
18
19import java.nio.ByteBuffer;
20
21import static java.lang.Character.MIN_HIGH_SURROGATE;
22import static java.lang.Character.MIN_LOW_SURROGATE;
23import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
24
25public abstract class Utf8 {
26
27 /**
28 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
29 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
30 * both time and space.
31 *
32 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
33 * surrogates)
34 */
35 public abstract int encodedLength(CharSequence sequence);
36
37 /**
38 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
39 *
40 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
41 * and the capabilities of the platform.
42 *
43 * @param in the source string to be encoded
44 * @param out the target buffer to receive the encoded string.
45 */
46 public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
47
48 /**
49 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
50 *
51 * @throws IllegalArgumentException if the input is not valid UTF-8.
52 */
53 public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
54
55 private static Utf8 DEFAULT;
56
57 /**
58 * Get the default UTF-8 processor.
59 * @return the default processor
60 */
61 public static Utf8 getDefault() {
62 if (DEFAULT == null) {
63 DEFAULT = new Utf8Safe();
64 }
65 return DEFAULT;
66 }
67
68 /**
69 * Set the default instance of the UTF-8 processor.
70 * @param instance the new instance to use
71 */
72 public static void setDefault(Utf8 instance) {
73 DEFAULT = instance;
74 }
75
76 /**
77 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
78 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
79 * checks and codepoint conversion happen in this class.
80 */
81 static class DecodeUtil {
82
83 /**
84 * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
85 */
86 static boolean isOneByte(byte b) {
87 return b >= 0;
88 }
89
90 /**
91 * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
92 */
93 static boolean isTwoBytes(byte b) {
94 return b < (byte) 0xE0;
95 }
96
97 /**
98 * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
99 */
100 static boolean isThreeBytes(byte b) {
101 return b < (byte) 0xF0;
102 }
103
104 static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
105 resultArr[resultPos] = (char) byte1;
106 }
107
108 static void handleTwoBytes(
109 byte byte1, byte byte2, char[] resultArr, int resultPos)
110 throws IllegalArgumentException {
111 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
112 // overlong 2-byte, '11000001'.
113 if (byte1 < (byte) 0xC2) {
114 throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
115 }
116 if (isNotTrailingByte(byte2)) {
117 throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
118 }
119 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
120 }
121
122 static void handleThreeBytes(
123 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
124 throws IllegalArgumentException {
125 if (isNotTrailingByte(byte2)
126 // overlong? 5 most significant bits must not all be zero
127 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
128 // check for illegal surrogate codepoints
129 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
130 || isNotTrailingByte(byte3)) {
131 throw new IllegalArgumentException("Invalid UTF-8");
132 }
133 resultArr[resultPos] = (char)
134 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
135 }
136
137 static void handleFourBytes(
138 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
139 throws IllegalArgumentException{
140 if (isNotTrailingByte(byte2)
141 // Check that 1 <= plane <= 16. Tricky optimized form of:
142 // valid 4-byte leading byte?
143 // if (byte1 > (byte) 0xF4 ||
144 // overlong? 4 most significant bits must not all be zero
145 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
146 // codepoint larger than the highest code point (U+10FFFF)?
147 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
148 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
149 || isNotTrailingByte(byte3)
150 || isNotTrailingByte(byte4)) {
151 throw new IllegalArgumentException("Invalid UTF-8");
152 }
153 int codepoint = ((byte1 & 0x07) << 18)
154 | (trailingByteValue(byte2) << 12)
155 | (trailingByteValue(byte3) << 6)
156 | trailingByteValue(byte4);
157 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
158 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
159 }
160
161 /**
162 * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
163 */
164 private static boolean isNotTrailingByte(byte b) {
165 return b > (byte) 0xBF;
166 }
167
168 /**
169 * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
170 */
171 private static int trailingByteValue(byte b) {
172 return b & 0x3F;
173 }
174
175 private static char highSurrogate(int codePoint) {
176 return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
177 + (codePoint >>> 10));
178 }
179
180 private static char lowSurrogate(int codePoint) {
181 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
182 }
183 }
184
185 // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
186 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
187 // fallback to more lenient behavior.
188 static class UnpairedSurrogateException extends IllegalArgumentException {
189 UnpairedSurrogateException(int index, int length) {
190 super("Unpaired surrogate at index " + index + " of " + length);
191 }
192 }
193}