// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include "snappy-stubs-internal.h"

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_NEON
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
#endif

namespace snappy {
namespace internal {

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);

#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
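
// Illustrative only (a sketch, not used by the implementation): the wrappers
// above give one portable byte shuffle across SSSE3 (_mm_shuffle_epi8) and
// NEON (vqtbl1q_u8). For example, reversing 16 bytes with a constant mask
// whose entries are all in [0,16), as the contract requires (`input` and
// `kReverse` are hypothetical names):
//
//   alignas(16) static const uint8_t kReverse[16] =
//       {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   V128 mask = V128_Load(reinterpret_cast<const V128*>(kReverse));
//   V128 reversed = V128_Shuffle(input, mask);  // byte i <- input byte 15-i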

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
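
// Illustrative usage (a sketch, not part of the API surface; `input_size`
// and `fragment_size` are hypothetical values supplied by the caller):
//
//   WorkingMemory wmem(input_size);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(fragment_size, &table_size);
//   // `table[0..table_size-1]` is cleared, matching the zero-initialization
//   // precondition of CompressFragment() below.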

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input, size_t input_length, char* op,
                       uint16_t* table, const int table_size);
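
// Illustrative usage (a sketch, continuing the WorkingMemory example above;
// `input`, `input_length`, and `op` are hypothetical):
//
//   assert(input_length <= kBlockSize);
//   char* end = CompressFragment(input, input_length, op, table, table_size);
//   size_t compressed_size = end - op;  // <= MaxCompressedLength(input_length)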

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition, populate *data with the next 5 bytes from the end of the
// match. This is only done if 8 bytes are available (s2_limit - s2 >= 8).
// The point is that on some architectures this can be done faster in this
// routine than by a subsequent load from s2 + n.
//
// Separate implementation for 64-bit, little-endian CPUs.
#if !SNAPPY_IS_BIG_ENDIAN && \
    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
     defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately. As an optimization, though, it is useful: it creates some
  // not-uncommon code paths that determine, without extra effort, whether
  // the match length is less than 8. In short, we are hoping to avoid a
  // conditional branch, and perhaps get better code layout from the C++
  // compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it
      // determines how much to advance `ip` (s2). This obviously depends on
      // both the loads from the `candidate` (s1) and `ip`. Furthermore the
      // next `candidate` depends on the advanced `ip` calculated here through
      // a load, hash and new candidate hash lookup (a lot of cycles). This
      // makes s1 (i.e. `candidate`) the variable that limits throughput. This
      // is the reason we go through hoops to have this function update `data`
      // for the next iteration. The straightforward code would use *data,
      // given by
      //
      //   *data = UNALIGNED_LOAD64(s2 + matched_bytes)  (latency of 5 cycles),
      //
      // as input for the hash table lookup to find the next candidate.
      // However this forces the load onto the data dependency chain of s1,
      // because matched_bytes directly depends on s1. However matched_bytes
      // is 0..7, so we can also calculate *data by
      //
      //   *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                      matched_bytes);
      //
      // The loads no longer depend on s1 and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      //   shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // unfortunately shrd with a variable shift has a 4 cycle latency, so
      // this only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle
      // variable shift instruction but can only shift 64 bits. If we focus on
      // just obtaining the least significant 4 bytes, we can obtain this by
      //
      //   *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //               UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win: the conditional move would
      // be a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
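      //
      // As a portable sketch of the AlignRight idea (illustrative only; the
      // actual code below uses the 4-byte variant with inline asm on x86_64),
      // for matched_bytes in [1, 8) on a little-endian machine:
      //
      //   uint64_t lo = UNALIGNED_LOAD64(s2);
      //   uint64_t hi = UNALIGNED_LOAD64(s2 + 8);
      //   // Same bytes as UNALIGNED_LOAD64(s2 + matched_bytes), but neither
      //   // load depends on s1, so both are off the critical chain.
      //   *data = (lo >> (8 * matched_bytes)) |
      //           (hi << (64 - 8 * matched_bytes));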
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      // Ideally this would just be
      //
      //   a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates
      // in a critical data dependency chain and thus, unfortunately, refuses
      // to use a conditional move (it's tuned to cut data dependencies). In
      // this case there is a longer parallel chain anyway AND this will be
      // fairly unpredictable.
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
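
// Illustrative usage of FindMatchLength (a sketch; the buffers are
// hypothetical and padded so the "does not read beyond" contracts above
// hold, since s1 must have at least s2_limit - s2 readable bytes):
//
//   const char s1[24] = "abcdefgh01234567AAAAAAA";  // candidate
//   const char s2[24] = "abcdefgh01234599BBBBBBB";  // ip
//   uint64_t data;
//   std::pair<size_t, bool> r = FindMatchLength(s1, s2, s2 + 24, &data);
//   // r.first == 14: "abcdefgh012345" matches; r.second == false (>= 8).
//   // data is populated with bytes from the end of the match, ready for
//   // the caller's next hash lookup (see the contract above).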

// Lookup tables for decompression code. Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//   Range   Bits-used   Description
//   ------------------------------------
//   1..64   0..7        Literal/copy length encoded in opcode byte
//   0..7    8..10       Copy offset encoded in opcode byte / 256
//   0..4    11..13      Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for two efficiency reasons:
//   (1) Extracting a byte is faster than extracting a bit-field.
//   (2) It properly aligns the copy offset, so we do not need a <<8.
static constexpr uint16_t char_table[256] = {
    // clang-format off
    0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
    0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
    0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
    0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
    0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
    0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
    0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
    0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
    0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
    0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
    0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
    0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
    0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
    0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
    0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
    0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
    0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
    0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
    0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
    0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
    0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
    0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
    0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
    0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
    0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
    0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
    0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
    0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
    0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
    0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
    0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};
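
// Illustrative only (a sketch of how a decompressor consumes an entry,
// following the bit layout documented above; `tag_byte` is hypothetical):
//
//   uint16_t entry = char_table[tag_byte];
//   uint32_t length      = entry & 0xff;        // bits 0..7
//   uint32_t offset_high = (entry >> 8) & 0x7;  // bits 8..10: offset / 256
//   uint32_t extra_bytes = entry >> 11;         // bits 11..13
//
// For example, char_table[0x00] == 0x0001 decodes to length 1 with no extra
// bytes, matching the tag encoding of a one-byte literal.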

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_