/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.flatbuffers;

import java.nio.ByteBuffer;

import static java.lang.Character.MAX_SURROGATE;
import static java.lang.Character.MIN_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;

29public abstract class Utf8 {
30
31 /**
32 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
33 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
34 * both time and space.
35 *
36 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
37 * surrogates)
38 */
39 public abstract int encodedLength(CharSequence sequence);
40
41 /**
42 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
43 *
44 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
45 * and the capabilities of the platform.
46 *
47 * @param in the source string to be encoded
48 * @param out the target buffer to receive the encoded string.
49 */
50 public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
51
52 /**
53 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
54 *
55 * @throws IllegalArgumentException if the input is not valid UTF-8.
56 */
57 public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
58
59 private static Utf8 DEFAULT;
60
61 /**
62 * Get the default UTF-8 processor.
63 * @return the default processor
64 */
65 public static Utf8 getDefault() {
66 if (DEFAULT == null) {
67 DEFAULT = new Utf8Safe();
68 }
69 return DEFAULT;
70 }
71
72 /**
73 * Set the default instance of the UTF-8 processor.
74 * @param instance the new instance to use
75 */
76 public static void setDefault(Utf8 instance) {
77 DEFAULT = instance;
78 }
79
80 /**
Austin Schuh272c6132020-11-14 16:37:52 -080081 * Encode a Java's CharSequence UTF8 codepoint into a byte array.
82 * @param in CharSequence to be encoded
83 * @param start start position of the first char in the codepoint
84 * @param out byte array of 4 bytes to be filled
85 * @return return the amount of bytes occupied by the codepoint
86 */
87 public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
88 // utf8 codepoint needs at least 4 bytes
89 assert out.length >= 4;
90
91 final int inLength = in.length();
92 if (start >= inLength) {
93 return 0;
94 }
95
96 char c = in.charAt(start);
97 if (c < 0x80) {
98 // One byte (0xxx xxxx)
99 out[0] = (byte) c;
100 return 1;
101 } else if (c < 0x800) {
102 // Two bytes (110x xxxx 10xx xxxx)
103 out[0] = (byte) (0xC0 | (c >>> 6));
104 out[1] = (byte) (0x80 | (0x3F & c));
105 return 2;
106 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
107 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
108 // Maximum single-char code point is 0xFFFF, 16 bits.
109 out[0] = (byte) (0xE0 | (c >>> 12));
110 out[1] =(byte) (0x80 | (0x3F & (c >>> 6)));
111 out[2] = (byte) (0x80 | (0x3F & c));
112 return 3;
113 } else {
114 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
115 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
116 // bytes
117 final char low;
118 if (start + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(start+1)))) {
119 throw new UnpairedSurrogateException(start, inLength);
120 }
121 int codePoint = toCodePoint(c, low);
122 out[0] = (byte) ((0xF << 4) | (codePoint >>> 18));
123 out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
124 out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
125 out[3] = (byte) (0x80 | (0x3F & codePoint));
126 return 4;
127 }
128 }
129
130 /**
Austin Schuhe89fa2d2019-08-14 20:24:23 -0700131 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
132 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
133 * checks and codepoint conversion happen in this class.
134 */
135 static class DecodeUtil {
136
137 /**
138 * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
139 */
140 static boolean isOneByte(byte b) {
141 return b >= 0;
142 }
143
144 /**
145 * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
146 */
147 static boolean isTwoBytes(byte b) {
148 return b < (byte) 0xE0;
149 }
150
151 /**
152 * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
153 */
154 static boolean isThreeBytes(byte b) {
155 return b < (byte) 0xF0;
156 }
157
158 static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
159 resultArr[resultPos] = (char) byte1;
160 }
161
162 static void handleTwoBytes(
163 byte byte1, byte byte2, char[] resultArr, int resultPos)
164 throws IllegalArgumentException {
165 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
166 // overlong 2-byte, '11000001'.
167 if (byte1 < (byte) 0xC2) {
168 throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
169 }
170 if (isNotTrailingByte(byte2)) {
171 throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
172 }
173 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
174 }
175
176 static void handleThreeBytes(
177 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
178 throws IllegalArgumentException {
179 if (isNotTrailingByte(byte2)
180 // overlong? 5 most significant bits must not all be zero
181 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
182 // check for illegal surrogate codepoints
183 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
184 || isNotTrailingByte(byte3)) {
185 throw new IllegalArgumentException("Invalid UTF-8");
186 }
187 resultArr[resultPos] = (char)
188 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
189 }
190
191 static void handleFourBytes(
192 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
193 throws IllegalArgumentException{
194 if (isNotTrailingByte(byte2)
195 // Check that 1 <= plane <= 16. Tricky optimized form of:
196 // valid 4-byte leading byte?
197 // if (byte1 > (byte) 0xF4 ||
198 // overlong? 4 most significant bits must not all be zero
199 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
200 // codepoint larger than the highest code point (U+10FFFF)?
201 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
202 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
203 || isNotTrailingByte(byte3)
204 || isNotTrailingByte(byte4)) {
205 throw new IllegalArgumentException("Invalid UTF-8");
206 }
207 int codepoint = ((byte1 & 0x07) << 18)
208 | (trailingByteValue(byte2) << 12)
209 | (trailingByteValue(byte3) << 6)
210 | trailingByteValue(byte4);
211 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
212 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
213 }
214
215 /**
216 * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
217 */
218 private static boolean isNotTrailingByte(byte b) {
219 return b > (byte) 0xBF;
220 }
221
222 /**
223 * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
224 */
225 private static int trailingByteValue(byte b) {
226 return b & 0x3F;
227 }
228
229 private static char highSurrogate(int codePoint) {
230 return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
231 + (codePoint >>> 10));
232 }
233
234 private static char lowSurrogate(int codePoint) {
235 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
236 }
237 }
238
239 // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
240 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
241 // fallback to more lenient behavior.
242 static class UnpairedSurrogateException extends IllegalArgumentException {
243 UnpairedSurrogateException(int index, int length) {
244 super("Unpaired surrogate at index " + index + " of " + length);
245 }
246 }
247}