Blame - java/com/google/flatbuffers/Utf8.java - RealtimeRoboticsGroup/test

blob: efb6811f8c58013afaa1ea4c3e935cd21f95ea79 [file] [log] [blame]

Austin Schuh	e89fa2d	2019-08-14 20:24:23 -0700	[diff] [blame^]	1	/*
				2	* Copyright 2014 Google Inc. All rights reserved.
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	package com.google.flatbuffers;
				18
				19	import java.nio.ByteBuffer;
				20
				21	import static java.lang.Character.MIN_HIGH_SURROGATE;
				22	import static java.lang.Character.MIN_LOW_SURROGATE;
				23	import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
				24
				25	public abstract class Utf8 {
				26
				27	/**
				28	* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
				29	* this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
				30	* both time and space.
				31	*
				32	* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
				33	* surrogates)
				34	*/
				35	public abstract int encodedLength(CharSequence sequence);
				36
				37	/**
				38	* Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
				39	*
				40	* <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
				41	* and the capabilities of the platform.
				42	*
				43	* @param in the source string to be encoded
				44	* @param out the target buffer to receive the encoded string.
				45	*/
				46	public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
				47
				48	/**
				49	* Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
				50	*
				51	* @throws IllegalArgumentException if the input is not valid UTF-8.
				52	*/
				53	public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
				54
				55	private static Utf8 DEFAULT;
				56
				57	/**
				58	* Get the default UTF-8 processor.
				59	* @return the default processor
				60	*/
				61	public static Utf8 getDefault() {
				62	if (DEFAULT == null) {
				63	DEFAULT = new Utf8Safe();
				64	}
				65	return DEFAULT;
				66	}
				67
				68	/**
				69	* Set the default instance of the UTF-8 processor.
				70	* @param instance the new instance to use
				71	*/
				72	public static void setDefault(Utf8 instance) {
				73	DEFAULT = instance;
				74	}
				75
				76	/**
				77	* Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
				78	* bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
				79	* checks and codepoint conversion happen in this class.
				80	*/
				81	static class DecodeUtil {
				82
				83	/**
				84	* Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
				85	*/
				86	static boolean isOneByte(byte b) {
				87	return b >= 0;
				88	}
				89
				90	/**
				91	* Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
				92	*/
				93	static boolean isTwoBytes(byte b) {
				94	return b < (byte) 0xE0;
				95	}
				96
				97	/**
				98	* Returns whether this is a three-byte codepoint with the form '110XXXXX'.
				99	*/
				100	static boolean isThreeBytes(byte b) {
				101	return b < (byte) 0xF0;
				102	}
				103
				104	static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
				105	resultArr[resultPos] = (char) byte1;
				106	}
				107
				108	static void handleTwoBytes(
				109	byte byte1, byte byte2, char[] resultArr, int resultPos)
				110	throws IllegalArgumentException {
				111	// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
				112	// overlong 2-byte, '11000001'.
				113	if (byte1 < (byte) 0xC2) {
				114	throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
				115	}
				116	if (isNotTrailingByte(byte2)) {
				117	throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
				118	}
				119	resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) \| trailingByteValue(byte2));
				120	}
				121
				122	static void handleThreeBytes(
				123	byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
				124	throws IllegalArgumentException {
				125	if (isNotTrailingByte(byte2)
				126	// overlong? 5 most significant bits must not all be zero
				127	\|\| (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
				128	// check for illegal surrogate codepoints
				129	\|\| (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
				130	\|\| isNotTrailingByte(byte3)) {
				131	throw new IllegalArgumentException("Invalid UTF-8");
				132	}
				133	resultArr[resultPos] = (char)
				134	(((byte1 & 0x0F) << 12) \| (trailingByteValue(byte2) << 6) \| trailingByteValue(byte3));
				135	}
				136
				137	static void handleFourBytes(
				138	byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
				139	throws IllegalArgumentException{
				140	if (isNotTrailingByte(byte2)
				141	// Check that 1 <= plane <= 16. Tricky optimized form of:
				142	// valid 4-byte leading byte?
				143	// if (byte1 > (byte) 0xF4 \|\|
				144	// overlong? 4 most significant bits must not all be zero
				145	// byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 \|\|
				146	// codepoint larger than the highest code point (U+10FFFF)?
				147	// byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
				148	\|\| (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
				149	\|\| isNotTrailingByte(byte3)
				150	\|\| isNotTrailingByte(byte4)) {
				151	throw new IllegalArgumentException("Invalid UTF-8");
				152	}
				153	int codepoint = ((byte1 & 0x07) << 18)
				154	\| (trailingByteValue(byte2) << 12)
				155	\| (trailingByteValue(byte3) << 6)
				156	\| trailingByteValue(byte4);
				157	resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
				158	resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
				159	}
				160
				161	/**
				162	* Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
				163	*/
				164	private static boolean isNotTrailingByte(byte b) {
				165	return b > (byte) 0xBF;
				166	}
				167
				168	/**
				169	* Returns the actual value of the trailing byte (removes the prefix '10') for composition.
				170	*/
				171	private static int trailingByteValue(byte b) {
				172	return b & 0x3F;
				173	}
				174
				175	private static char highSurrogate(int codePoint) {
				176	return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
				177	+ (codePoint >>> 10));
				178	}
				179
				180	private static char lowSurrogate(int codePoint) {
				181	return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
				182	}
				183	}
				184
				185	// These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
				186	// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
				187	// fallback to more lenient behavior.
				188	static class UnpairedSurrogateException extends IllegalArgumentException {
				189	UnpairedSurrogateException(int index, int length) {
				190	super("Unpaired surrogate at index " + index + " of " + length);
				191	}
				192	}
				193	}