Blame - java/com/google/flatbuffers/Utf8Safe.java - RealtimeRoboticsGroup/test

blob: 06ea420bb57ab0bb9f518f5dcfba617cc1ec06c6 [file] [log] [blame]

Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame^]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc. All rights reserved.
				3	// https://developers.google.com/protocol-buffers/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are
				7	// met:
				8	//
				9	// * Redistributions of source code must retain the above copyright
				10	// notice, this list of conditions and the following disclaimer.
				11	// * Redistributions in binary form must reproduce the above
				12	// copyright notice, this list of conditions and the following disclaimer
				13	// in the documentation and/or other materials provided with the
				14	// distribution.
				15	// * Neither the name of Google Inc. nor the names of its
				16	// contributors may be used to endorse or promote products derived from
				17	// this software without specific prior written permission.
				18	//
				19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30
				31	package com.google.flatbuffers;
				32
				33	import java.nio.ByteBuffer;
				34	import static java.lang.Character.MAX_SURROGATE;
				35	import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
				36	import static java.lang.Character.MIN_SURROGATE;
				37	import static java.lang.Character.isSurrogatePair;
				38	import static java.lang.Character.toCodePoint;
				39
				40	/**
				41	* A set of low-level, high-performance static utility methods related
				42	* to the UTF-8 character encoding. This class has no dependencies
				43	* outside of the core JDK libraries.
				44	*
				45	* <p>There are several variants of UTF-8. The one implemented by
				46	* this class is the restricted definition of UTF-8 introduced in
				47	* Unicode 3.1, which mandates the rejection of "overlong" byte
				48	* sequences as well as rejection of 3-byte surrogate codepoint byte
				49	* sequences. Note that the UTF-8 decoder included in Oracle's JDK
				50	* has been modified to also reject "overlong" byte sequences, but (as
				51	* of 2011) still accepts 3-byte surrogate codepoint byte sequences.
				52	*
				53	* <p>The byte sequences considered valid by this class are exactly
				54	* those that can be roundtrip converted to Strings and back to bytes
				55	* using the UTF-8 charset, without loss: <pre> {@code
				56	* Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
				57	* }</pre>
				58	*
				59	* <p>See the Unicode Standard,</br>
				60	* Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
				61	* Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
				62	*/
				63	final public class Utf8Safe extends Utf8 {
				64
				65	/**
				66	* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
				67	* this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
				68	* both time and space.
				69	*
				70	* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
				71	* surrogates)
				72	*/
				73	private static int computeEncodedLength(CharSequence sequence) {
				74	// Warning to maintainers: this implementation is highly optimized.
				75	int utf16Length = sequence.length();
				76	int utf8Length = utf16Length;
				77	int i = 0;
				78
				79	// This loop optimizes for pure ASCII.
				80	while (i < utf16Length && sequence.charAt(i) < 0x80) {
				81	i++;
				82	}
				83
				84	// This loop optimizes for chars less than 0x800.
				85	for (; i < utf16Length; i++) {
				86	char c = sequence.charAt(i);
				87	if (c < 0x800) {
				88	utf8Length += ((0x7f - c) >>> 31); // branch free!
				89	} else {
				90	utf8Length += encodedLengthGeneral(sequence, i);
				91	break;
				92	}
				93	}
				94
				95	if (utf8Length < utf16Length) {
				96	// Necessary and sufficient condition for overflow because of maximum 3x expansion
				97	throw new IllegalArgumentException("UTF-8 length does not fit in int: "
				98	+ (utf8Length + (1L << 32)));
				99	}
				100	return utf8Length;
				101	}
				102
				103	private static int encodedLengthGeneral(CharSequence sequence, int start) {
				104	int utf16Length = sequence.length();
				105	int utf8Length = 0;
				106	for (int i = start; i < utf16Length; i++) {
				107	char c = sequence.charAt(i);
				108	if (c < 0x800) {
				109	utf8Length += (0x7f - c) >>> 31; // branch free!
				110	} else {
				111	utf8Length += 2;
				112	// jdk7+: if (Character.isSurrogate(c)) {
				113	if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
				114	// Check that we have a well-formed surrogate pair.
				115	int cp = Character.codePointAt(sequence, i);
				116	if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
				117	throw new Utf8Safe.UnpairedSurrogateException(i, utf16Length);
				118	}
				119	i++;
				120	}
				121	}
				122	}
				123	return utf8Length;
				124	}
				125
				126	private static String decodeUtf8Array(byte[] bytes, int index, int size) {
				127	// Bitwise OR combines the sign bits so any negative value fails the check.
				128	if ((index \| size \| bytes.length - index - size) < 0) {
				129	throw new ArrayIndexOutOfBoundsException(
				130	String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
				131	}
				132
				133	int offset = index;
				134	final int limit = offset + size;
				135
				136	// The longest possible resulting String is the same as the number of input bytes, when it is
				137	// all ASCII. For other cases, this over-allocates and we will truncate in the end.
				138	char[] resultArr = new char[size];
				139	int resultPos = 0;
				140
				141	// Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
				142	// This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
				143	while (offset < limit) {
				144	byte b = bytes[offset];
				145	if (!DecodeUtil.isOneByte(b)) {
				146	break;
				147	}
				148	offset++;
				149	DecodeUtil.handleOneByte(b, resultArr, resultPos++);
				150	}
				151
				152	while (offset < limit) {
				153	byte byte1 = bytes[offset++];
				154	if (DecodeUtil.isOneByte(byte1)) {
				155	DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
				156	// It's common for there to be multiple ASCII characters in a run mixed in, so add an
				157	// extra optimized loop to take care of these runs.
				158	while (offset < limit) {
				159	byte b = bytes[offset];
				160	if (!DecodeUtil.isOneByte(b)) {
				161	break;
				162	}
				163	offset++;
				164	DecodeUtil.handleOneByte(b, resultArr, resultPos++);
				165	}
				166	} else if (DecodeUtil.isTwoBytes(byte1)) {
				167	if (offset >= limit) {
				168	throw new IllegalArgumentException("Invalid UTF-8");
				169	}
				170	DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++);
				171	} else if (DecodeUtil.isThreeBytes(byte1)) {
				172	if (offset >= limit - 1) {
				173	throw new IllegalArgumentException("Invalid UTF-8");
				174	}
				175	DecodeUtil.handleThreeBytes(
				176	byte1,
				177	/* byte2 */ bytes[offset++],
				178	/* byte3 */ bytes[offset++],
				179	resultArr,
				180	resultPos++);
				181	} else {
				182	if (offset >= limit - 2) {
				183	throw new IllegalArgumentException("Invalid UTF-8");
				184	}
				185	DecodeUtil.handleFourBytes(
				186	byte1,
				187	/* byte2 */ bytes[offset++],
				188	/* byte3 */ bytes[offset++],
				189	/* byte4 */ bytes[offset++],
				190	resultArr,
				191	resultPos++);
				192	// 4-byte case requires two chars.
				193	resultPos++;
				194	}
				195	}
				196
				197	return new String(resultArr, 0, resultPos);
				198	}
				199
				200	private static String decodeUtf8Buffer(ByteBuffer buffer, int offset,
				201	int length) {
				202	// Bitwise OR combines the sign bits so any negative value fails the check.
				203	if ((offset \| length \| buffer.limit() - offset - length) < 0) {
				204	throw new ArrayIndexOutOfBoundsException(
				205	String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(),
				206	offset, length));
				207	}
				208
				209	final int limit = offset + length;
				210
				211	// The longest possible resulting String is the same as the number of input bytes, when it is
				212	// all ASCII. For other cases, this over-allocates and we will truncate in the end.
				213	char[] resultArr = new char[length];
				214	int resultPos = 0;
				215
				216	// Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
				217	// This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
				218	while (offset < limit) {
				219	byte b = buffer.get(offset);
				220	if (!DecodeUtil.isOneByte(b)) {
				221	break;
				222	}
				223	offset++;
				224	DecodeUtil.handleOneByte(b, resultArr, resultPos++);
				225	}
				226
				227	while (offset < limit) {
				228	byte byte1 = buffer.get(offset++);
				229	if (DecodeUtil.isOneByte(byte1)) {
				230	DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
				231	// It's common for there to be multiple ASCII characters in a run mixed in, so add an
				232	// extra optimized loop to take care of these runs.
				233	while (offset < limit) {
				234	byte b = buffer.get(offset);
				235	if (!DecodeUtil.isOneByte(b)) {
				236	break;
				237	}
				238	offset++;
				239	DecodeUtil.handleOneByte(b, resultArr, resultPos++);
				240	}
				241	} else if (DecodeUtil.isTwoBytes(byte1)) {
				242	if (offset >= limit) {
				243	throw new IllegalArgumentException("Invalid UTF-8");
				244	}
				245	DecodeUtil.handleTwoBytes(
				246	byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++);
				247	} else if (DecodeUtil.isThreeBytes(byte1)) {
				248	if (offset >= limit - 1) {
				249	throw new IllegalArgumentException("Invalid UTF-8");
				250	}
				251	DecodeUtil.handleThreeBytes(
				252	byte1,
				253	/* byte2 */ buffer.get(offset++),
				254	/* byte3 */ buffer.get(offset++),
				255	resultArr,
				256	resultPos++);
				257	} else {
				258	if (offset >= limit - 2) {
				259	throw new IllegalArgumentException("Invalid UTF-8");
				260	}
				261	DecodeUtil.handleFourBytes(
				262	byte1,
				263	/* byte2 */ buffer.get(offset++),
				264	/* byte3 */ buffer.get(offset++),
				265	/* byte4 */ buffer.get(offset++),
				266	resultArr,
				267	resultPos++);
				268	// 4-byte case requires two chars.
				269	resultPos++;
				270	}
				271	}
				272
				273	return new String(resultArr, 0, resultPos);
				274	}
				275
				276	@Override
				277	public int encodedLength(CharSequence in) {
				278	return computeEncodedLength(in);
				279	}
				280
				281	/**
				282	* Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
				283	*
				284	* @throws IllegalArgumentException if the input is not valid UTF-8.
				285	*/
				286	@Override
				287	public String decodeUtf8(ByteBuffer buffer, int offset, int length)
				288	throws IllegalArgumentException {
				289	if (buffer.hasArray()) {
				290	return decodeUtf8Array(buffer.array(), buffer.arrayOffset() + offset, length);
				291	} else {
				292	return decodeUtf8Buffer(buffer, offset, length);
				293	}
				294	}
				295
				296
				297	private static void encodeUtf8Buffer(CharSequence in, ByteBuffer out) {
				298	final int inLength = in.length();
				299	int outIx = out.position();
				300	int inIx = 0;
				301
				302	// Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
				303	// access. Assume the buffer is big enough and let it handle the out of bounds exception
				304	// if it occurs.
				305	try {
				306	// Designed to take advantage of
				307	// https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
				308	for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
				309	out.put(outIx + inIx, (byte) c);
				310	}
				311	if (inIx == inLength) {
				312	// Successfully encoded the entire string.
				313	out.position(outIx + inIx);
				314	return;
				315	}
				316
				317	outIx += inIx;
				318	for (char c; inIx < inLength; ++inIx, ++outIx) {
				319	c = in.charAt(inIx);
				320	if (c < 0x80) {
				321	// One byte (0xxx xxxx)
				322	out.put(outIx, (byte) c);
				323	} else if (c < 0x800) {
				324	// Two bytes (110x xxxx 10xx xxxx)
				325
				326	// Benchmarks show put performs better than putShort here (for HotSpot).
				327	out.put(outIx++, (byte) (0xC0 \| (c >>> 6)));
				328	out.put(outIx, (byte) (0x80 \| (0x3F & c)));
				329	} else if (c < MIN_SURROGATE \|\| MAX_SURROGATE < c) {
				330	// Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
				331	// Maximum single-char code point is 0xFFFF, 16 bits.
				332
				333	// Benchmarks show put performs better than putShort here (for HotSpot).
				334	out.put(outIx++, (byte) (0xE0 \| (c >>> 12)));
				335	out.put(outIx++, (byte) (0x80 \| (0x3F & (c >>> 6))));
				336	out.put(outIx, (byte) (0x80 \| (0x3F & c)));
				337	} else {
				338	// Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
				339
				340	// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
				341	// bytes
				342	final char low;
				343	if (inIx + 1 == inLength \|\| !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
				344	throw new UnpairedSurrogateException(inIx, inLength);
				345	}
				346	// TODO(nathanmittler): Consider using putInt() to improve performance.
				347	int codePoint = toCodePoint(c, low);
				348	out.put(outIx++, (byte) ((0xF << 4) \| (codePoint >>> 18)));
				349	out.put(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 12))));
				350	out.put(outIx++, (byte) (0x80 \| (0x3F & (codePoint >>> 6))));
				351	out.put(outIx, (byte) (0x80 \| (0x3F & codePoint)));
				352	}
				353	}
				354
				355	// Successfully encoded the entire string.
				356	out.position(outIx);
				357	} catch (IndexOutOfBoundsException e) {
				358	// TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead.
				359
				360	// If we failed in the outer ASCII loop, outIx will not have been updated. In this case,
				361	// use inIx to determine the bad write index.
				362	int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
				363	throw new ArrayIndexOutOfBoundsException(
				364	"Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex);
				365	}
				366	}
				367
				368	private static int encodeUtf8Array(CharSequence in, byte[] out,
				369	int offset, int length) {
				370	int utf16Length = in.length();
				371	int j = offset;
				372	int i = 0;
				373	int limit = offset + length;
				374	// Designed to take advantage of
				375	// https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
				376	for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) {
				377	out[j + i] = (byte) c;
				378	}
				379	if (i == utf16Length) {
				380	return j + utf16Length;
				381	}
				382	j += i;
				383	for (char c; i < utf16Length; i++) {
				384	c = in.charAt(i);
				385	if (c < 0x80 && j < limit) {
				386	out[j++] = (byte) c;
				387	} else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
				388	out[j++] = (byte) ((0xF << 6) \| (c >>> 6));
				389	out[j++] = (byte) (0x80 \| (0x3F & c));
				390	} else if ((c < Character.MIN_SURROGATE \|\| Character.MAX_SURROGATE < c) && j <= limit - 3) {
				391	// Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
				392	out[j++] = (byte) ((0xF << 5) \| (c >>> 12));
				393	out[j++] = (byte) (0x80 \| (0x3F & (c >>> 6)));
				394	out[j++] = (byte) (0x80 \| (0x3F & c));
				395	} else if (j <= limit - 4) {
				396	// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
				397	// four UTF-8 bytes
				398	final char low;
				399	if (i + 1 == in.length()
				400	\|\| !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
				401	throw new UnpairedSurrogateException((i - 1), utf16Length);
				402	}
				403	int codePoint = Character.toCodePoint(c, low);
				404	out[j++] = (byte) ((0xF << 4) \| (codePoint >>> 18));
				405	out[j++] = (byte) (0x80 \| (0x3F & (codePoint >>> 12)));
				406	out[j++] = (byte) (0x80 \| (0x3F & (codePoint >>> 6)));
				407	out[j++] = (byte) (0x80 \| (0x3F & codePoint));
				408	} else {
				409	// If we are surrogates and we're not a surrogate pair, always throw an
				410	// UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
				411	if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
				412	&& (i + 1 == in.length()
				413	\|\| !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
				414	throw new UnpairedSurrogateException(i, utf16Length);
				415	}
				416	throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
				417	}
				418	}
				419	return j;
				420	}
				421
				422	/**
				423	* Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
				424	*
				425	* <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
				426	* and the capabilities of the platform.
				427	*
				428	* @param in the source string to be encoded
				429	* @param out the target buffer to receive the encoded string.
				430	*/
				431	@Override
				432	public void encodeUtf8(CharSequence in, ByteBuffer out) {
				433	if (out.hasArray()) {
				434	int start = out.arrayOffset();
				435	int end = encodeUtf8Array(in, out.array(), start + out.position(),
				436	out.remaining());
				437	out.position(end - start);
				438	} else {
				439	encodeUtf8Buffer(in, out);
				440	}
				441	}
				442
				443	// These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with
				444	// a modification to throw a local exception. This exception can be caught
				445	// to fallback to more lenient behavior.
				446	static class UnpairedSurrogateException extends IllegalArgumentException {
				447	UnpairedSurrogateException(int index, int length) {
				448	super("Unpaired surrogate at index " + index + " of " + length);
				449	}
				450	}
				451	}