// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include "snappy-stubs-internal.h"

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_NEON
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
#endif

namespace snappy {
namespace internal {

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);

#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
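
// Illustrative only (a sketch, not used by the implementation): the wrappers
// above give one portable byte shuffle across SSSE3 (_mm_shuffle_epi8) and
// NEON (vqtbl1q_u8). For example, reversing 16 bytes with a constant mask
// whose entries are all in [0,16), as the contract requires (`input` and
// `kReverse` are hypothetical names):
//
//   alignas(16) static const uint8_t kReverse[16] =
//       {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   V128 mask = V128_Load(reinterpret_cast<const V128*>(kReverse));
//   V128 reversed = V128_Shuffle(input, mask);  // byte i <- input byte 15-i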

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
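
// Illustrative usage (a sketch, not part of the API surface; `input_size`
// and `fragment_size` are hypothetical values supplied by the caller):
//
//   WorkingMemory wmem(input_size);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(fragment_size, &table_size);
//   // `table[0..table_size-1]` is cleared, matching the zero-initialization
//   // precondition of CompressFragment() below.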

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input, size_t input_length, char* op,
                       uint16_t* table, const int table_size);
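
// Illustrative usage (a sketch, continuing the WorkingMemory example above;
// `input`, `input_length`, and `op` are hypothetical):
//
//   assert(input_length <= kBlockSize);
//   char* end = CompressFragment(input, input_length, op, table, table_size);
//   size_t compressed_size = end - op;  // <= MaxCompressedLength(input_length)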

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition, populate *data with the next 5 bytes from the end of the
// match. This is only done if 8 bytes are available (s2_limit - s2 >= 8).
// The point is that on some architectures this can be done faster in this
// routine than by a subsequent load from s2 + n.
//
// Separate implementation for 64-bit, little-endian CPUs.
#if !SNAPPY_IS_BIG_ENDIAN && \
    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
     defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately. As an optimization, though, it is useful: it creates some
  // not-uncommon code paths that determine, without extra effort, whether
  // the match length is less than 8. In short, we are hoping to avoid a
  // conditional branch, and perhaps get better code layout from the C++
  // compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it
      // determines how much to advance `ip` (s2). This obviously depends on
      // both the loads from the `candidate` (s1) and `ip`. Furthermore the
      // next `candidate` depends on the advanced `ip` calculated here through
      // a load, hash and new candidate hash lookup (a lot of cycles). This
      // makes s1 (i.e. `candidate`) the variable that limits throughput. This
      // is the reason we go through hoops to have this function update `data`
      // for the next iteration. The straightforward code would use *data,
      // given by
      //
      //   *data = UNALIGNED_LOAD64(s2 + matched_bytes)  (latency of 5 cycles),
      //
      // as input for the hash table lookup to find the next candidate.
      // However this forces the load onto the data dependency chain of s1,
      // because matched_bytes directly depends on s1. However matched_bytes
      // is 0..7, so we can also calculate *data by
      //
      //   *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                      matched_bytes);
      //
      // The loads no longer depend on s1 and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      //   shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // unfortunately shrd with a variable shift has a 4 cycle latency, so
      // this only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle
      // variable shift instruction but can only shift 64 bits. If we focus on
      // just obtaining the least significant 4 bytes, we can obtain this by
      //
      //   *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //               UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win: the conditional move would
      // be a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
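      //
      // As a portable sketch of the AlignRight idea (illustrative only; the
      // actual code below uses the 4-byte variant with inline asm on x86_64),
      // for matched_bytes in [1, 8) on a little-endian machine:
      //
      //   uint64_t lo = UNALIGNED_LOAD64(s2);
      //   uint64_t hi = UNALIGNED_LOAD64(s2 + 8);
      //   // Same bytes as UNALIGNED_LOAD64(s2 + matched_bytes), but neither
      //   // load depends on s1, so both are off the critical chain.
      //   *data = (lo >> (8 * matched_bytes)) |
      //           (hi << (64 - 8 * matched_bytes));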
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      // Ideally this would just be
      //
      //   a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates
      // in a critical data dependency chain and thus, unfortunately, refuses
      // to use a conditional move (it's tuned to cut data dependencies). In
      // this case there is a longer parallel chain anyway AND this will be
      // fairly unpredictable.
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
      *data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
      *data = a2 >> (shift & (3 * 8));
#endif
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif
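
// Illustrative usage of FindMatchLength (a sketch; the buffers are
// hypothetical and padded so the "does not read beyond" contracts above
// hold, since s1 must have at least s2_limit - s2 readable bytes):
//
//   const char s1[24] = "abcdefgh01234567AAAAAAA";  // candidate
//   const char s2[24] = "abcdefgh01234599BBBBBBB";  // ip
//   uint64_t data;
//   std::pair<size_t, bool> r = FindMatchLength(s1, s2, s2 + 24, &data);
//   // r.first == 14: "abcdefgh012345" matches; r.second == false (>= 8).
//   // data is populated with bytes from the end of the match, ready for
//   // the caller's next hash lookup (see the contract above).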

// Lookup tables for decompression code. Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//   Range   Bits-used   Description
//   ------------------------------------
//   1..64   0..7        Literal/copy length encoded in opcode byte
//   0..7    8..10       Copy offset encoded in opcode byte / 256
//   0..4    11..13      Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for two efficiency reasons:
//   (1) Extracting a byte is faster than extracting a bit-field.
//   (2) It properly aligns the copy offset, so we do not need a <<8.
static constexpr uint16_t char_table[256] = {
    // clang-format off
    0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
    0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
    0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
    0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
    0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
    0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
    0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
    0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
    0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
    0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
    0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
    0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
    0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
    0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
    0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
    0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
    0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
    0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
    0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
    0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
    0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
    0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
    0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
    0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
    0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
    0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
    0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
    0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
    0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
    0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
    0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};
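
// Illustrative only (a sketch of how a decompressor consumes an entry,
// following the bit layout documented above; `tag_byte` is hypothetical):
//
//   uint16_t entry = char_table[tag_byte];
//   uint32_t length      = entry & 0xff;        // bits 0..7
//   uint32_t offset_high = (entry >> 8) & 0x7;  // bits 8..10: offset / 256
//   uint32_t extra_bytes = entry >> 11;         // bits 11..13
//
// For example, char_table[0x00] == 0x0001 decodes to length 1 with no extra
// bytes, matching the tag encoding of a one-byte literal.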

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_