James Kuszmaul | 48dd4c8 | 2021-10-27 20:04:08 -0700 | [diff] [blame] | 1 | // Copyright 2020 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Redistribution and use in source and binary forms, with or without |
| 4 | // modification, are permitted provided that the following conditions are |
| 5 | // met: |
| 6 | // |
| 7 | // * Redistributions of source code must retain the above copyright |
| 8 | // notice, this list of conditions and the following disclaimer. |
| 9 | // * Redistributions in binary form must reproduce the above |
| 10 | // copyright notice, this list of conditions and the following disclaimer |
| 11 | // in the documentation and/or other materials provided with the |
| 12 | // distribution. |
| 13 | // * Neither the name of Google Inc. nor the names of its |
| 14 | // contributors may be used to endorse or promote products derived from |
| 15 | // this software without specific prior written permission. |
| 16 | // |
| 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 28 | |
| 29 | #include <algorithm> |
| 30 | #include <cmath> |
| 31 | #include <cstdint> |
| 32 | #include <cstdlib> |
| 33 | #include <random> |
| 34 | #include <string> |
| 35 | #include <utility> |
| 36 | #include <vector> |
| 37 | |
| 38 | #include "snappy-test.h" |
| 39 | |
| 40 | #include "snappy-internal.h" |
| 41 | #include "snappy-sinksource.h" |
| 42 | #include "snappy.h" |
| 43 | #include "snappy_test_data.h" |
| 44 | |
| 45 | SNAPPY_FLAG(int32_t, start_len, -1, |
| 46 | "Starting prefix size for testing (-1: just full file contents)"); |
| 47 | SNAPPY_FLAG(int32_t, end_len, -1, |
| 48 | "Starting prefix size for testing (-1: just full file contents)"); |
| 49 | SNAPPY_FLAG(int32_t, bytes, 10485760, |
| 50 | "How many bytes to compress/uncompress per file for timing"); |
| 51 | |
| 52 | SNAPPY_FLAG(bool, zlib, true, |
| 53 | "Run zlib compression (http://www.zlib.net)"); |
| 54 | SNAPPY_FLAG(bool, lzo, true, |
| 55 | "Run LZO compression (http://www.oberhumer.com/opensource/lzo/)"); |
| 56 | SNAPPY_FLAG(bool, lz4, true, |
| 57 | "Run LZ4 compression (https://github.com/lz4/lz4)"); |
| 58 | SNAPPY_FLAG(bool, snappy, true, "Run snappy compression"); |
| 59 | |
| 60 | SNAPPY_FLAG(bool, write_compressed, false, |
| 61 | "Write compressed versions of each file to <file>.comp"); |
| 62 | SNAPPY_FLAG(bool, write_uncompressed, false, |
| 63 | "Write uncompressed versions of each file to <file>.uncomp"); |
| 64 | |
| 65 | namespace snappy { |
| 66 | |
| 67 | namespace { |
| 68 | |
| 69 | #if HAVE_FUNC_MMAP && HAVE_FUNC_SYSCONF |
| 70 | |
| 71 | // To test against code that reads beyond its input, this class copies a |
| 72 | // string to a newly allocated group of pages, the last of which |
| 73 | // is made unreadable via mprotect. Note that we need to allocate the |
| 74 | // memory with mmap(), as POSIX allows mprotect() only on memory allocated |
| 75 | // with mmap(), and some malloc/posix_memalign implementations expect to |
| 76 | // be able to read previously allocated memory while doing heap allocations. |
| 77 | class DataEndingAtUnreadablePage { |
| 78 | public: |
| 79 | explicit DataEndingAtUnreadablePage(const std::string& s) { |
| 80 | const size_t page_size = sysconf(_SC_PAGESIZE); |
| 81 | const size_t size = s.size(); |
| 82 | // Round up space for string to a multiple of page_size. |
| 83 | size_t space_for_string = (size + page_size - 1) & ~(page_size - 1); |
| 84 | alloc_size_ = space_for_string + page_size; |
| 85 | mem_ = mmap(NULL, alloc_size_, |
| 86 | PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); |
| 87 | CHECK_NE(MAP_FAILED, mem_); |
| 88 | protected_page_ = reinterpret_cast<char*>(mem_) + space_for_string; |
| 89 | char* dst = protected_page_ - size; |
| 90 | std::memcpy(dst, s.data(), size); |
| 91 | data_ = dst; |
| 92 | size_ = size; |
| 93 | // Make guard page unreadable. |
| 94 | CHECK_EQ(0, mprotect(protected_page_, page_size, PROT_NONE)); |
| 95 | } |
| 96 | |
| 97 | ~DataEndingAtUnreadablePage() { |
| 98 | const size_t page_size = sysconf(_SC_PAGESIZE); |
| 99 | // Undo the mprotect. |
| 100 | CHECK_EQ(0, mprotect(protected_page_, page_size, PROT_READ|PROT_WRITE)); |
| 101 | CHECK_EQ(0, munmap(mem_, alloc_size_)); |
| 102 | } |
| 103 | |
| 104 | const char* data() const { return data_; } |
| 105 | size_t size() const { return size_; } |
| 106 | |
| 107 | private: |
| 108 | size_t alloc_size_; |
| 109 | void* mem_; |
| 110 | char* protected_page_; |
| 111 | const char* data_; |
| 112 | size_t size_; |
| 113 | }; |
| 114 | |
| 115 | #else // HAVE_FUNC_MMAP && HAVE_FUNC_SYSCONF |
| 116 | |
| 117 | // Fallback for systems without mmap. |
| 118 | using DataEndingAtUnreadablePage = std::string; |
| 119 | |
| 120 | #endif |
| 121 | |
| 122 | enum CompressorType { ZLIB, LZO, LZ4, SNAPPY }; |
| 123 | |
| 124 | const char* names[] = {"ZLIB", "LZO", "LZ4", "SNAPPY"}; |
| 125 | |
| 126 | size_t MinimumRequiredOutputSpace(size_t input_size, CompressorType comp) { |
| 127 | switch (comp) { |
| 128 | #ifdef ZLIB_VERSION |
| 129 | case ZLIB: |
| 130 | return ZLib::MinCompressbufSize(input_size); |
| 131 | #endif // ZLIB_VERSION |
| 132 | |
| 133 | #ifdef LZO_VERSION |
| 134 | case LZO: |
| 135 | return input_size + input_size/64 + 16 + 3; |
| 136 | #endif // LZO_VERSION |
| 137 | |
| 138 | #ifdef LZ4_VERSION_NUMBER |
| 139 | case LZ4: |
| 140 | return LZ4_compressBound(input_size); |
| 141 | #endif // LZ4_VERSION_NUMBER |
| 142 | |
| 143 | case SNAPPY: |
| 144 | return snappy::MaxCompressedLength(input_size); |
| 145 | |
| 146 | default: |
| 147 | LOG(FATAL) << "Unknown compression type number " << comp; |
| 148 | return 0; |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | // Returns true if we successfully compressed, false otherwise. |
| 153 | // |
| 154 | // If compressed_is_preallocated is set, do not resize the compressed buffer. |
| 155 | // This is typically what you want for a benchmark, in order to not spend |
| 156 | // time in the memory allocator. If you do set this flag, however, |
| 157 | // "compressed" must be preinitialized to at least MinCompressbufSize(comp) |
| 158 | // number of bytes, and may contain junk bytes at the end after return. |
| 159 | bool Compress(const char* input, size_t input_size, CompressorType comp, |
| 160 | std::string* compressed, bool compressed_is_preallocated) { |
| 161 | if (!compressed_is_preallocated) { |
| 162 | compressed->resize(MinimumRequiredOutputSpace(input_size, comp)); |
| 163 | } |
| 164 | |
| 165 | switch (comp) { |
| 166 | #ifdef ZLIB_VERSION |
| 167 | case ZLIB: { |
| 168 | ZLib zlib; |
| 169 | uLongf destlen = compressed->size(); |
| 170 | int ret = zlib.Compress( |
| 171 | reinterpret_cast<Bytef*>(string_as_array(compressed)), |
| 172 | &destlen, |
| 173 | reinterpret_cast<const Bytef*>(input), |
| 174 | input_size); |
| 175 | CHECK_EQ(Z_OK, ret); |
| 176 | if (!compressed_is_preallocated) { |
| 177 | compressed->resize(destlen); |
| 178 | } |
| 179 | return true; |
| 180 | } |
| 181 | #endif // ZLIB_VERSION |
| 182 | |
| 183 | #ifdef LZO_VERSION |
| 184 | case LZO: { |
| 185 | unsigned char* mem = new unsigned char[LZO1X_1_15_MEM_COMPRESS]; |
| 186 | lzo_uint destlen; |
| 187 | int ret = lzo1x_1_15_compress( |
| 188 | reinterpret_cast<const uint8_t*>(input), |
| 189 | input_size, |
| 190 | reinterpret_cast<uint8_t*>(string_as_array(compressed)), |
| 191 | &destlen, |
| 192 | mem); |
| 193 | CHECK_EQ(LZO_E_OK, ret); |
| 194 | delete[] mem; |
| 195 | if (!compressed_is_preallocated) { |
| 196 | compressed->resize(destlen); |
| 197 | } |
| 198 | break; |
| 199 | } |
| 200 | #endif // LZO_VERSION |
| 201 | |
| 202 | #ifdef LZ4_VERSION_NUMBER |
| 203 | case LZ4: { |
| 204 | int destlen = compressed->size(); |
| 205 | destlen = LZ4_compress_default(input, string_as_array(compressed), |
| 206 | input_size, destlen); |
| 207 | CHECK_NE(destlen, 0); |
| 208 | if (!compressed_is_preallocated) { |
| 209 | compressed->resize(destlen); |
| 210 | } |
| 211 | break; |
| 212 | } |
| 213 | #endif // LZ4_VERSION_NUMBER |
| 214 | |
| 215 | case SNAPPY: { |
| 216 | size_t destlen; |
| 217 | snappy::RawCompress(input, input_size, |
| 218 | string_as_array(compressed), |
| 219 | &destlen); |
| 220 | CHECK_LE(destlen, snappy::MaxCompressedLength(input_size)); |
| 221 | if (!compressed_is_preallocated) { |
| 222 | compressed->resize(destlen); |
| 223 | } |
| 224 | break; |
| 225 | } |
| 226 | |
| 227 | default: { |
| 228 | return false; // the asked-for library wasn't compiled in |
| 229 | } |
| 230 | } |
| 231 | return true; |
| 232 | } |
| 233 | |
| 234 | bool Uncompress(const std::string& compressed, CompressorType comp, int size, |
| 235 | std::string* output) { |
| 236 | // TODO: Switch to [[maybe_unused]] when we can assume C++17. |
| 237 | (void)size; |
| 238 | switch (comp) { |
| 239 | #ifdef ZLIB_VERSION |
| 240 | case ZLIB: { |
| 241 | output->resize(size); |
| 242 | ZLib zlib; |
| 243 | uLongf destlen = output->size(); |
| 244 | int ret = zlib.Uncompress( |
| 245 | reinterpret_cast<Bytef*>(string_as_array(output)), |
| 246 | &destlen, |
| 247 | reinterpret_cast<const Bytef*>(compressed.data()), |
| 248 | compressed.size()); |
| 249 | CHECK_EQ(Z_OK, ret); |
| 250 | CHECK_EQ(static_cast<uLongf>(size), destlen); |
| 251 | break; |
| 252 | } |
| 253 | #endif // ZLIB_VERSION |
| 254 | |
| 255 | #ifdef LZO_VERSION |
| 256 | case LZO: { |
| 257 | output->resize(size); |
| 258 | lzo_uint destlen; |
| 259 | int ret = lzo1x_decompress( |
| 260 | reinterpret_cast<const uint8_t*>(compressed.data()), |
| 261 | compressed.size(), |
| 262 | reinterpret_cast<uint8_t*>(string_as_array(output)), |
| 263 | &destlen, |
| 264 | NULL); |
| 265 | CHECK_EQ(LZO_E_OK, ret); |
| 266 | CHECK_EQ(static_cast<lzo_uint>(size), destlen); |
| 267 | break; |
| 268 | } |
| 269 | #endif // LZO_VERSION |
| 270 | |
| 271 | #ifdef LZ4_VERSION_NUMBER |
| 272 | case LZ4: { |
| 273 | output->resize(size); |
| 274 | int destlen = output->size(); |
| 275 | destlen = LZ4_decompress_safe(compressed.data(), string_as_array(output), |
| 276 | compressed.size(), destlen); |
| 277 | CHECK_NE(destlen, 0); |
| 278 | CHECK_EQ(size, destlen); |
| 279 | break; |
| 280 | } |
| 281 | #endif // LZ4_VERSION_NUMBER |
| 282 | case SNAPPY: { |
| 283 | snappy::RawUncompress(compressed.data(), compressed.size(), |
| 284 | string_as_array(output)); |
| 285 | break; |
| 286 | } |
| 287 | |
| 288 | default: { |
| 289 | return false; // the asked-for library wasn't compiled in |
| 290 | } |
| 291 | } |
| 292 | return true; |
| 293 | } |
| 294 | |
| 295 | void Measure(const char* data, size_t length, CompressorType comp, int repeats, |
| 296 | int block_size) { |
| 297 | // Run tests a few time and pick median running times |
| 298 | static const int kRuns = 5; |
| 299 | double ctime[kRuns]; |
| 300 | double utime[kRuns]; |
| 301 | int compressed_size = 0; |
| 302 | |
| 303 | { |
| 304 | // Chop the input into blocks |
| 305 | int num_blocks = (length + block_size - 1) / block_size; |
| 306 | std::vector<const char*> input(num_blocks); |
| 307 | std::vector<size_t> input_length(num_blocks); |
| 308 | std::vector<std::string> compressed(num_blocks); |
| 309 | std::vector<std::string> output(num_blocks); |
| 310 | for (int b = 0; b < num_blocks; ++b) { |
| 311 | int input_start = b * block_size; |
| 312 | int input_limit = std::min<int>((b+1)*block_size, length); |
| 313 | input[b] = data+input_start; |
| 314 | input_length[b] = input_limit-input_start; |
| 315 | } |
| 316 | |
| 317 | // Pre-grow the output buffers so we don't measure string append time. |
| 318 | for (std::string& compressed_block : compressed) { |
| 319 | compressed_block.resize(MinimumRequiredOutputSpace(block_size, comp)); |
| 320 | } |
| 321 | |
| 322 | // First, try one trial compression to make sure the code is compiled in |
| 323 | if (!Compress(input[0], input_length[0], comp, &compressed[0], true)) { |
| 324 | LOG(WARNING) << "Skipping " << names[comp] << ": " |
| 325 | << "library not compiled in"; |
| 326 | return; |
| 327 | } |
| 328 | |
| 329 | for (int run = 0; run < kRuns; ++run) { |
| 330 | CycleTimer ctimer, utimer; |
| 331 | |
| 332 | // Pre-grow the output buffers so we don't measure string append time. |
| 333 | for (std::string& compressed_block : compressed) { |
| 334 | compressed_block.resize(MinimumRequiredOutputSpace(block_size, comp)); |
| 335 | } |
| 336 | |
| 337 | ctimer.Start(); |
| 338 | for (int b = 0; b < num_blocks; ++b) { |
| 339 | for (int i = 0; i < repeats; ++i) |
| 340 | Compress(input[b], input_length[b], comp, &compressed[b], true); |
| 341 | } |
| 342 | ctimer.Stop(); |
| 343 | |
| 344 | // Compress once more, with resizing, so we don't leave junk |
| 345 | // at the end that will confuse the decompressor. |
| 346 | for (int b = 0; b < num_blocks; ++b) { |
| 347 | Compress(input[b], input_length[b], comp, &compressed[b], false); |
| 348 | } |
| 349 | |
| 350 | for (int b = 0; b < num_blocks; ++b) { |
| 351 | output[b].resize(input_length[b]); |
| 352 | } |
| 353 | |
| 354 | utimer.Start(); |
| 355 | for (int i = 0; i < repeats; ++i) { |
| 356 | for (int b = 0; b < num_blocks; ++b) |
| 357 | Uncompress(compressed[b], comp, input_length[b], &output[b]); |
| 358 | } |
| 359 | utimer.Stop(); |
| 360 | |
| 361 | ctime[run] = ctimer.Get(); |
| 362 | utime[run] = utimer.Get(); |
| 363 | } |
| 364 | |
| 365 | compressed_size = 0; |
| 366 | for (const std::string& compressed_item : compressed) { |
| 367 | compressed_size += compressed_item.size(); |
| 368 | } |
| 369 | } |
| 370 | |
| 371 | std::sort(ctime, ctime + kRuns); |
| 372 | std::sort(utime, utime + kRuns); |
| 373 | const int med = kRuns/2; |
| 374 | |
| 375 | float comp_rate = (length / ctime[med]) * repeats / 1048576.0; |
| 376 | float uncomp_rate = (length / utime[med]) * repeats / 1048576.0; |
| 377 | std::string x = names[comp]; |
| 378 | x += ":"; |
| 379 | std::string urate = (uncomp_rate >= 0) ? StrFormat("%.1f", uncomp_rate) |
| 380 | : std::string("?"); |
| 381 | std::printf("%-7s [b %dM] bytes %6d -> %6d %4.1f%% " |
| 382 | "comp %5.1f MB/s uncomp %5s MB/s\n", |
| 383 | x.c_str(), |
| 384 | block_size/(1<<20), |
| 385 | static_cast<int>(length), static_cast<uint32_t>(compressed_size), |
| 386 | (compressed_size * 100.0) / std::max<int>(1, length), |
| 387 | comp_rate, |
| 388 | urate.c_str()); |
| 389 | } |
| 390 | |
| 391 | void CompressFile(const char* fname) { |
| 392 | std::string fullinput; |
| 393 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
| 394 | |
| 395 | std::string compressed; |
| 396 | Compress(fullinput.data(), fullinput.size(), SNAPPY, &compressed, false); |
| 397 | |
| 398 | CHECK_OK(file::SetContents(std::string(fname).append(".comp"), compressed, |
| 399 | file::Defaults())); |
| 400 | } |
| 401 | |
| 402 | void UncompressFile(const char* fname) { |
| 403 | std::string fullinput; |
| 404 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
| 405 | |
| 406 | size_t uncompLength; |
| 407 | CHECK(snappy::GetUncompressedLength(fullinput.data(), fullinput.size(), |
| 408 | &uncompLength)); |
| 409 | |
| 410 | std::string uncompressed; |
| 411 | uncompressed.resize(uncompLength); |
| 412 | CHECK(snappy::Uncompress(fullinput.data(), fullinput.size(), &uncompressed)); |
| 413 | |
| 414 | CHECK_OK(file::SetContents(std::string(fname).append(".uncomp"), uncompressed, |
| 415 | file::Defaults())); |
| 416 | } |
| 417 | |
| 418 | void MeasureFile(const char* fname) { |
| 419 | std::string fullinput; |
| 420 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
| 421 | std::printf("%-40s :\n", fname); |
| 422 | |
| 423 | int start_len = (snappy::GetFlag(FLAGS_start_len) < 0) |
| 424 | ? fullinput.size() |
| 425 | : snappy::GetFlag(FLAGS_start_len); |
| 426 | int end_len = fullinput.size(); |
| 427 | if (snappy::GetFlag(FLAGS_end_len) >= 0) { |
| 428 | end_len = std::min<int>(fullinput.size(), snappy::GetFlag(FLAGS_end_len)); |
| 429 | } |
| 430 | for (int len = start_len; len <= end_len; ++len) { |
| 431 | const char* const input = fullinput.data(); |
| 432 | int repeats = (snappy::GetFlag(FLAGS_bytes) + len) / (len + 1); |
| 433 | if (snappy::GetFlag(FLAGS_zlib)) |
| 434 | Measure(input, len, ZLIB, repeats, 1024 << 10); |
| 435 | if (snappy::GetFlag(FLAGS_lzo)) |
| 436 | Measure(input, len, LZO, repeats, 1024 << 10); |
| 437 | if (snappy::GetFlag(FLAGS_lz4)) |
| 438 | Measure(input, len, LZ4, repeats, 1024 << 10); |
| 439 | if (snappy::GetFlag(FLAGS_snappy)) |
| 440 | Measure(input, len, SNAPPY, repeats, 4096 << 10); |
| 441 | |
| 442 | // For block-size based measurements |
| 443 | if (0 && snappy::GetFlag(FLAGS_snappy)) { |
| 444 | Measure(input, len, SNAPPY, repeats, 8<<10); |
| 445 | Measure(input, len, SNAPPY, repeats, 16<<10); |
| 446 | Measure(input, len, SNAPPY, repeats, 32<<10); |
| 447 | Measure(input, len, SNAPPY, repeats, 64<<10); |
| 448 | Measure(input, len, SNAPPY, repeats, 256<<10); |
| 449 | Measure(input, len, SNAPPY, repeats, 1024<<10); |
| 450 | } |
| 451 | } |
| 452 | } |
| 453 | |
| 454 | } // namespace |
| 455 | |
| 456 | } // namespace snappy |
| 457 | |
| 458 | int main(int argc, char** argv) { |
| 459 | InitGoogle(argv[0], &argc, &argv, true); |
| 460 | |
| 461 | for (int arg = 1; arg < argc; ++arg) { |
| 462 | if (snappy::GetFlag(FLAGS_write_compressed)) { |
| 463 | snappy::CompressFile(argv[arg]); |
| 464 | } else if (snappy::GetFlag(FLAGS_write_uncompressed)) { |
| 465 | snappy::UncompressFile(argv[arg]); |
| 466 | } else { |
| 467 | snappy::MeasureFile(argv[arg]); |
| 468 | } |
| 469 | } |
| 470 | return 0; |
| 471 | } |