Blame - java/com/google/flatbuffers/Utf8.java - RealtimeRoboticsGroup/test

blob: e8af8ad6c7f9ad1cf066960dd5497f3448216f67 [file] [log] [blame]

Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame]	1	/*
				2	* Copyright 2014 Google Inc. All rights reserved.
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	package com.google.flatbuffers;
				18
				19	import java.nio.ByteBuffer;
				20
Austin Schuh	272c613	2020-11-14 16:37:52 -0800	[diff] [blame^]	21	import static java.lang.Character.MAX_SURROGATE;
				22	import static java.lang.Character.MIN_SURROGATE;
Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame]	23	import static java.lang.Character.MIN_HIGH_SURROGATE;
				24	import static java.lang.Character.MIN_LOW_SURROGATE;
				25	import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
Austin Schuh	272c613	2020-11-14 16:37:52 -0800	[diff] [blame^]	26	import static java.lang.Character.isSurrogatePair;
				27	import static java.lang.Character.toCodePoint;
Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame]	28
				29	public abstract class Utf8 {
				30
				31	/**
				32	* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
				33	* this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
				34	* both time and space.
				35	*
				36	* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
				37	* surrogates)
				38	*/
				39	public abstract int encodedLength(CharSequence sequence);
				40
				41	/**
				42	* Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
				43	*
				44	* <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
				45	* and the capabilities of the platform.
				46	*
				47	* @param in the source string to be encoded
				48	* @param out the target buffer to receive the encoded string.
				49	*/
				50	public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
				51
				52	/**
				53	* Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
				54	*
				55	* @throws IllegalArgumentException if the input is not valid UTF-8.
				56	*/
				57	public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
				58
				59	private static Utf8 DEFAULT;
				60
				61	/**
				62	* Get the default UTF-8 processor.
				63	* @return the default processor
				64	*/
				65	public static Utf8 getDefault() {
				66	if (DEFAULT == null) {
				67	DEFAULT = new Utf8Safe();
				68	}
				69	return DEFAULT;
				70	}
				71
				72	/**
				73	* Set the default instance of the UTF-8 processor.
				74	* @param instance the new instance to use
				75	*/
				76	public static void setDefault(Utf8 instance) {
				77	DEFAULT = instance;
				78	}
				79
				80	/**
Austin Schuh	272c613	2020-11-14 16:37:52 -0800	[diff] [blame^]	81	* Encode a Java's CharSequence UTF8 codepoint into a byte array.
				82	* @param in CharSequence to be encoded
				83	* @param start start position of the first char in the codepoint
				84	* @param out byte array of 4 bytes to be filled
				85	* @return return the amount of bytes occupied by the codepoint
				86	*/
				87	public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
				88	// utf8 codepoint needs at least 4 bytes
				89	assert out.length >= 4;
				90
				91	final int inLength = in.length();
				92	if (start >= inLength) {
				93	return 0;
				94	}
				95
				96	char c = in.charAt(start);
				97	if (c < 0x80) {
				98	// One byte (0xxx xxxx)
				99	out[0] = (byte) c;
				100	return 1;
				101	} else if (c < 0x800) {
				102	// Two bytes (110x xxxx 10xx xxxx)
				103	out[0] = (byte) (0xC0 \| (c >>> 6));
				104	out[1] = (byte) (0x80 \| (0x3F & c));
				105	return 2;
				106	} else if (c < MIN_SURROGATE \|\| MAX_SURROGATE < c) {
				107	// Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
				108	// Maximum single-char code point is 0xFFFF, 16 bits.
				109	out[0] = (byte) (0xE0 \| (c >>> 12));
				110	out[1] =(byte) (0x80 \| (0x3F & (c >>> 6)));
				111	out[2] = (byte) (0x80 \| (0x3F & c));
				112	return 3;
				113	} else {
				114	// Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
				115	// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
				116	// bytes
				117	final char low;
				118	if (start + 1 == inLength \|\| !isSurrogatePair(c, (low = in.charAt(start+1)))) {
				119	throw new UnpairedSurrogateException(start, inLength);
				120	}
				121	int codePoint = toCodePoint(c, low);
				122	out[0] = (byte) ((0xF << 4) \| (codePoint >>> 18));
				123	out[1] = (byte) (0x80 \| (0x3F & (codePoint >>> 12)));
				124	out[2] = (byte) (0x80 \| (0x3F & (codePoint >>> 6)));
				125	out[3] = (byte) (0x80 \| (0x3F & codePoint));
				126	return 4;
				127	}
				128	}
				129
				130	/**
Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame]	131	* Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
				132	* bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
				133	* checks and codepoint conversion happen in this class.
				134	*/
				135	static class DecodeUtil {
				136
				137	/**
				138	* Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
				139	*/
				140	static boolean isOneByte(byte b) {
				141	return b >= 0;
				142	}
				143
				144	/**
				145	* Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
				146	*/
				147	static boolean isTwoBytes(byte b) {
				148	return b < (byte) 0xE0;
				149	}
				150
				151	/**
				152	* Returns whether this is a three-byte codepoint with the form '110XXXXX'.
				153	*/
				154	static boolean isThreeBytes(byte b) {
				155	return b < (byte) 0xF0;
				156	}
				157
				158	static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
				159	resultArr[resultPos] = (char) byte1;
				160	}
				161
				162	static void handleTwoBytes(
				163	byte byte1, byte byte2, char[] resultArr, int resultPos)
				164	throws IllegalArgumentException {
				165	// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
				166	// overlong 2-byte, '11000001'.
				167	if (byte1 < (byte) 0xC2) {
				168	throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
				169	}
				170	if (isNotTrailingByte(byte2)) {
				171	throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
				172	}
				173	resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) \| trailingByteValue(byte2));
				174	}
				175
				176	static void handleThreeBytes(
				177	byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
				178	throws IllegalArgumentException {
				179	if (isNotTrailingByte(byte2)
				180	// overlong? 5 most significant bits must not all be zero
				181	\|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
				182	// check for illegal surrogate codepoints
				183	\|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
				184	\|\| isNotTrailingByte(byte3)) {
				185	throw new IllegalArgumentException("Invalid UTF-8");
				186	}
				187	resultArr[resultPos] = (char)
				188	(((byte1 & 0x0F) << 12) \| (trailingByteValue(byte2) << 6) \| trailingByteValue(byte3));
				189	}
				190
				191	static void handleFourBytes(
				192	byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
				193	throws IllegalArgumentException{
				194	if (isNotTrailingByte(byte2)
				195	// Check that 1 <= plane <= 16. Tricky optimized form of:
				196	// valid 4-byte leading byte?
				197	// if (byte1 > (byte) 0xF4 \|\|
				198	// overlong? 4 most significant bits must not all be zero
				199	// byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|
				200	// codepoint larger than the highest code point (U+10FFFF)?
				201	// byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
				202	\|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
				203	\|\| isNotTrailingByte(byte3)
				204	\|\| isNotTrailingByte(byte4)) {
				205	throw new IllegalArgumentException("Invalid UTF-8");
				206	}
				207	int codepoint = ((byte1 & 0x07) << 18)
				208	\| (trailingByteValue(byte2) << 12)
				209	\| (trailingByteValue(byte3) << 6)
				210	\| trailingByteValue(byte4);
				211	resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
				212	resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
				213	}
				214
				215	/**
				216	* Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
				217	*/
				218	private static boolean isNotTrailingByte(byte b) {
				219	return b > (byte) 0xBF;
				220	}
				221
				222	/**
				223	* Returns the actual value of the trailing byte (removes the prefix '10') for composition.
				224	*/
				225	private static int trailingByteValue(byte b) {
				226	return b & 0x3F;
				227	}
				228
				229	private static char highSurrogate(int codePoint) {
				230	return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
				231	+ (codePoint >>> 10));
				232	}
				233
				234	private static char lowSurrogate(int codePoint) {
				235	return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
				236	}
				237	}
				238
				239	// These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
				240	// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
				241	// fallback to more lenient behavior.
				242	static class UnpairedSurrogateException extends IllegalArgumentException {
				243	UnpairedSurrogateException(int index, int length) {
				244	super("Unpaired surrogate at index " + index + " of " + length);
				245	}
				246	}
				247	}