Austin Schuh | 189376f | 2018-12-20 22:11:15 +1100 | [diff] [blame^] | 1 | // This file is part of Eigen, a lightweight C++ template library |
| 2 | // for linear algebra. |
| 3 | // |
| 4 | // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> |
| 5 | // |
| 6 | // This Source Code Form is subject to the terms of the Mozilla |
| 7 | // Public License v. 2.0. If a copy of the MPL was not distributed |
| 8 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 9 | |
| 10 | #include <iostream> |
| 11 | #include <cstdint> |
| 12 | #include <cstdlib> |
| 13 | #include <vector> |
| 14 | #include <fstream> |
| 15 | #include <memory> |
| 16 | #include <cstdio> |
| 17 | |
| 18 | bool eigen_use_specific_block_size; |
| 19 | int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n; |
| 20 | #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size |
| 21 | #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k |
| 22 | #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m |
| 23 | #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n |
| 24 | #include <Eigen/Core> |
| 25 | |
| 26 | #include <bench/BenchTimer.h> |
| 27 | |
| 28 | using namespace Eigen; |
| 29 | using namespace std; |
| 30 | |
| 31 | static BenchTimer timer; |
| 32 | |
| 33 | // how many times we repeat each measurement. |
| 34 | // measurements are randomly shuffled - we're not doing |
| 35 | // all N identical measurements in a row. |
| 36 | const int measurement_repetitions = 3; |
| 37 | |
| 38 | // Timings below this value are too short to be accurate, |
| 39 | // we'll repeat measurements with more iterations until |
| 40 | // we get a timing above that threshold. |
| 41 | const float min_accurate_time = 1e-2f; |
| 42 | |
| 43 | // See --min-working-set-size command line parameter. |
| 44 | size_t min_working_set_size = 0; |
| 45 | |
| 46 | float max_clock_speed = 0.0f; |
| 47 | |
| 48 | // range of sizes that we will benchmark (in all 3 K,M,N dimensions) |
| 49 | const size_t maxsize = 2048; |
| 50 | const size_t minsize = 16; |
| 51 | |
| 52 | typedef MatrixXf MatrixType; |
| 53 | typedef MatrixType::Scalar Scalar; |
| 54 | typedef internal::packet_traits<Scalar>::type Packet; |
| 55 | |
| 56 | static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two"); |
| 57 | static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two"); |
| 58 | static_assert(maxsize > minsize, "maxsize must be larger than minsize"); |
| 59 | static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)"); |
| 60 | |
| 61 | // just a helper to store a triple of K,M,N sizes for matrix product |
| 62 | struct size_triple_t |
| 63 | { |
| 64 | size_t k, m, n; |
| 65 | size_triple_t() : k(0), m(0), n(0) {} |
| 66 | size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {} |
| 67 | size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {} |
| 68 | size_triple_t(uint16_t compact) |
| 69 | { |
| 70 | k = 1 << ((compact & 0xf00) >> 8); |
| 71 | m = 1 << ((compact & 0x0f0) >> 4); |
| 72 | n = 1 << ((compact & 0x00f) >> 0); |
| 73 | } |
| 74 | }; |
| 75 | |
| 76 | uint8_t log2_pot(size_t x) { |
| 77 | size_t l = 0; |
| 78 | while (x >>= 1) l++; |
| 79 | return l; |
| 80 | } |
| 81 | |
| 82 | // Convert between size tripes and a compact form fitting in 12 bits |
| 83 | // where each size, which must be a POT, is encoded as its log2, on 4 bits |
| 84 | // so the largest representable size is 2^15 == 32k ... big enough. |
| 85 | uint16_t compact_size_triple(size_t k, size_t m, size_t n) |
| 86 | { |
| 87 | return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); |
| 88 | } |
| 89 | |
| 90 | uint16_t compact_size_triple(const size_triple_t& t) |
| 91 | { |
| 92 | return compact_size_triple(t.k, t.m, t.n); |
| 93 | } |
| 94 | |
| 95 | // A single benchmark. Initially only contains benchmark params. |
| 96 | // Then call run(), which stores the result in the gflops field. |
| 97 | struct benchmark_t |
| 98 | { |
| 99 | uint16_t compact_product_size; |
| 100 | uint16_t compact_block_size; |
| 101 | bool use_default_block_size; |
| 102 | float gflops; |
| 103 | benchmark_t() |
| 104 | : compact_product_size(0) |
| 105 | , compact_block_size(0) |
| 106 | , use_default_block_size(false) |
| 107 | , gflops(0) |
| 108 | { |
| 109 | } |
| 110 | benchmark_t(size_t pk, size_t pm, size_t pn, |
| 111 | size_t bk, size_t bm, size_t bn) |
| 112 | : compact_product_size(compact_size_triple(pk, pm, pn)) |
| 113 | , compact_block_size(compact_size_triple(bk, bm, bn)) |
| 114 | , use_default_block_size(false) |
| 115 | , gflops(0) |
| 116 | {} |
| 117 | benchmark_t(size_t pk, size_t pm, size_t pn) |
| 118 | : compact_product_size(compact_size_triple(pk, pm, pn)) |
| 119 | , compact_block_size(0) |
| 120 | , use_default_block_size(true) |
| 121 | , gflops(0) |
| 122 | {} |
| 123 | |
| 124 | void run(); |
| 125 | }; |
| 126 | |
| 127 | ostream& operator<<(ostream& s, const benchmark_t& b) |
| 128 | { |
| 129 | s << hex << b.compact_product_size << dec; |
| 130 | if (b.use_default_block_size) { |
| 131 | size_triple_t t(b.compact_product_size); |
| 132 | Index k = t.k, m = t.m, n = t.n; |
| 133 | internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n); |
| 134 | s << " default(" << k << ", " << m << ", " << n << ")"; |
| 135 | } else { |
| 136 | s << " " << hex << b.compact_block_size << dec; |
| 137 | } |
| 138 | s << " " << b.gflops; |
| 139 | return s; |
| 140 | } |
| 141 | |
| 142 | // We sort first by increasing benchmark parameters, |
| 143 | // then by decreasing performance. |
| 144 | bool operator<(const benchmark_t& b1, const benchmark_t& b2) |
| 145 | { |
| 146 | return b1.compact_product_size < b2.compact_product_size || |
| 147 | (b1.compact_product_size == b2.compact_product_size && ( |
| 148 | (b1.compact_block_size < b2.compact_block_size || ( |
| 149 | b1.compact_block_size == b2.compact_block_size && |
| 150 | b1.gflops > b2.gflops)))); |
| 151 | } |
| 152 | |
| 153 | void benchmark_t::run() |
| 154 | { |
| 155 | size_triple_t productsizes(compact_product_size); |
| 156 | |
| 157 | if (use_default_block_size) { |
| 158 | eigen_use_specific_block_size = false; |
| 159 | } else { |
| 160 | // feed eigen with our custom blocking params |
| 161 | eigen_use_specific_block_size = true; |
| 162 | size_triple_t blocksizes(compact_block_size); |
| 163 | eigen_block_size_k = blocksizes.k; |
| 164 | eigen_block_size_m = blocksizes.m; |
| 165 | eigen_block_size_n = blocksizes.n; |
| 166 | } |
| 167 | |
| 168 | // set up the matrix pool |
| 169 | |
| 170 | const size_t combined_three_matrices_sizes = |
| 171 | sizeof(Scalar) * |
| 172 | (productsizes.k * productsizes.m + |
| 173 | productsizes.k * productsizes.n + |
| 174 | productsizes.m * productsizes.n); |
| 175 | |
| 176 | // 64 M is large enough that nobody has a cache bigger than that, |
| 177 | // while still being small enough that everybody has this much RAM, |
| 178 | // so conveniently we don't need to special-case platforms here. |
| 179 | const size_t unlikely_large_cache_size = 64 << 20; |
| 180 | |
| 181 | const size_t working_set_size = |
| 182 | min_working_set_size ? min_working_set_size : unlikely_large_cache_size; |
| 183 | |
| 184 | const size_t matrix_pool_size = |
| 185 | 1 + working_set_size / combined_three_matrices_sizes; |
| 186 | |
| 187 | MatrixType *lhs = new MatrixType[matrix_pool_size]; |
| 188 | MatrixType *rhs = new MatrixType[matrix_pool_size]; |
| 189 | MatrixType *dst = new MatrixType[matrix_pool_size]; |
| 190 | |
| 191 | for (size_t i = 0; i < matrix_pool_size; i++) { |
| 192 | lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k); |
| 193 | rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n); |
| 194 | dst[i] = MatrixType::Zero(productsizes.m, productsizes.n); |
| 195 | } |
| 196 | |
| 197 | // main benchmark loop |
| 198 | |
| 199 | int iters_at_a_time = 1; |
| 200 | float time_per_iter = 0.0f; |
| 201 | size_t matrix_index = 0; |
| 202 | while (true) { |
| 203 | |
| 204 | double starttime = timer.getCpuTime(); |
| 205 | for (int i = 0; i < iters_at_a_time; i++) { |
| 206 | dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; |
| 207 | matrix_index++; |
| 208 | if (matrix_index == matrix_pool_size) { |
| 209 | matrix_index = 0; |
| 210 | } |
| 211 | } |
| 212 | double endtime = timer.getCpuTime(); |
| 213 | |
| 214 | const float timing = float(endtime - starttime); |
| 215 | |
| 216 | if (timing >= min_accurate_time) { |
| 217 | time_per_iter = timing / iters_at_a_time; |
| 218 | break; |
| 219 | } |
| 220 | |
| 221 | iters_at_a_time *= 2; |
| 222 | } |
| 223 | |
| 224 | delete[] lhs; |
| 225 | delete[] rhs; |
| 226 | delete[] dst; |
| 227 | |
| 228 | gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter; |
| 229 | } |
| 230 | |
| 231 | void print_cpuinfo() |
| 232 | { |
| 233 | #ifdef __linux__ |
| 234 | cout << "contents of /proc/cpuinfo:" << endl; |
| 235 | string line; |
| 236 | ifstream cpuinfo("/proc/cpuinfo"); |
| 237 | if (cpuinfo.is_open()) { |
| 238 | while (getline(cpuinfo, line)) { |
| 239 | cout << line << endl; |
| 240 | } |
| 241 | cpuinfo.close(); |
| 242 | } |
| 243 | cout << endl; |
| 244 | #elif defined __APPLE__ |
| 245 | cout << "output of sysctl hw:" << endl; |
| 246 | system("sysctl hw"); |
| 247 | cout << endl; |
| 248 | #endif |
| 249 | } |
| 250 | |
| 251 | template <typename T> |
| 252 | string type_name() |
| 253 | { |
| 254 | return "unknown"; |
| 255 | } |
| 256 | |
| 257 | template<> |
| 258 | string type_name<float>() |
| 259 | { |
| 260 | return "float"; |
| 261 | } |
| 262 | |
| 263 | template<> |
| 264 | string type_name<double>() |
| 265 | { |
| 266 | return "double"; |
| 267 | } |
| 268 | |
| 269 | struct action_t |
| 270 | { |
| 271 | virtual const char* invokation_name() const { abort(); return nullptr; } |
| 272 | virtual void run() const { abort(); } |
| 273 | virtual ~action_t() {} |
| 274 | }; |
| 275 | |
| 276 | void show_usage_and_exit(int /*argc*/, char* argv[], |
| 277 | const vector<unique_ptr<action_t>>& available_actions) |
| 278 | { |
| 279 | cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl; |
| 280 | cerr << "available actions:" << endl << endl; |
| 281 | for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { |
| 282 | cerr << " " << (*it)->invokation_name() << endl; |
| 283 | } |
| 284 | cerr << endl; |
| 285 | cerr << "options:" << endl << endl; |
| 286 | cerr << " --min-working-set-size=N:" << endl; |
| 287 | cerr << " Set the minimum working set size to N bytes." << endl; |
| 288 | cerr << " This is rounded up as needed to a multiple of matrix size." << endl; |
| 289 | cerr << " A larger working set lowers the chance of a warm cache." << endl; |
| 290 | cerr << " The default value 0 means use a large enough working" << endl; |
| 291 | cerr << " set to likely outsize caches." << endl; |
| 292 | cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl; |
| 293 | cerr << " avoid warm caches." << endl; |
| 294 | exit(1); |
| 295 | } |
| 296 | |
| 297 | float measure_clock_speed() |
| 298 | { |
| 299 | cerr << "Measuring clock speed... \r" << flush; |
| 300 | |
| 301 | vector<float> all_gflops; |
| 302 | for (int i = 0; i < 8; i++) { |
| 303 | benchmark_t b(1024, 1024, 1024); |
| 304 | b.run(); |
| 305 | all_gflops.push_back(b.gflops); |
| 306 | } |
| 307 | |
| 308 | sort(all_gflops.begin(), all_gflops.end()); |
| 309 | float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5]; |
| 310 | |
| 311 | // multiply by an arbitrary constant to discourage trying doing anything with the |
| 312 | // returned values besides just comparing them with each other. |
| 313 | float result = stable_estimate * 123.456f; |
| 314 | |
| 315 | return result; |
| 316 | } |
| 317 | |
| 318 | struct human_duration_t |
| 319 | { |
| 320 | int seconds; |
| 321 | human_duration_t(int s) : seconds(s) {} |
| 322 | }; |
| 323 | |
| 324 | ostream& operator<<(ostream& s, const human_duration_t& d) |
| 325 | { |
| 326 | int remainder = d.seconds; |
| 327 | if (remainder > 3600) { |
| 328 | int hours = remainder / 3600; |
| 329 | s << hours << " h "; |
| 330 | remainder -= hours * 3600; |
| 331 | } |
| 332 | if (remainder > 60) { |
| 333 | int minutes = remainder / 60; |
| 334 | s << minutes << " min "; |
| 335 | remainder -= minutes * 60; |
| 336 | } |
| 337 | if (d.seconds < 600) { |
| 338 | s << remainder << " s"; |
| 339 | } |
| 340 | return s; |
| 341 | } |
| 342 | |
| 343 | const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"; |
| 344 | |
| 345 | void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run) |
| 346 | { |
| 347 | FILE* file = fopen(filename, "w"); |
| 348 | if (!file) { |
| 349 | cerr << "Could not open file " << filename << " for writing." << endl; |
| 350 | cerr << "Do you have write permissions on the current working directory?" << endl; |
| 351 | exit(1); |
| 352 | } |
| 353 | size_t benchmarks_vector_size = benchmarks.size(); |
| 354 | fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file); |
| 355 | fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file); |
| 356 | fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file); |
| 357 | fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file); |
| 358 | fclose(file); |
| 359 | } |
| 360 | |
| 361 | bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run) |
| 362 | { |
| 363 | FILE* file = fopen(filename, "r"); |
| 364 | if (!file) { |
| 365 | return false; |
| 366 | } |
| 367 | if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) { |
| 368 | return false; |
| 369 | } |
| 370 | size_t benchmarks_vector_size = 0; |
| 371 | if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) { |
| 372 | return false; |
| 373 | } |
| 374 | if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) { |
| 375 | return false; |
| 376 | } |
| 377 | benchmarks.resize(benchmarks_vector_size); |
| 378 | if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) { |
| 379 | return false; |
| 380 | } |
| 381 | unlink(filename); |
| 382 | return true; |
| 383 | } |
| 384 | |
| 385 | void try_run_some_benchmarks( |
| 386 | vector<benchmark_t>& benchmarks, |
| 387 | double time_start, |
| 388 | size_t& first_benchmark_to_run) |
| 389 | { |
| 390 | if (first_benchmark_to_run == benchmarks.size()) { |
| 391 | return; |
| 392 | } |
| 393 | |
| 394 | double time_last_progress_update = 0; |
| 395 | double time_last_clock_speed_measurement = 0; |
| 396 | double time_now = 0; |
| 397 | |
| 398 | size_t benchmark_index = first_benchmark_to_run; |
| 399 | |
| 400 | while (true) { |
| 401 | float ratio_done = float(benchmark_index) / benchmarks.size(); |
| 402 | time_now = timer.getRealTime(); |
| 403 | |
| 404 | // We check clock speed every minute and at the end. |
| 405 | if (benchmark_index == benchmarks.size() || |
| 406 | time_now > time_last_clock_speed_measurement + 60.0f) |
| 407 | { |
| 408 | time_last_clock_speed_measurement = time_now; |
| 409 | |
| 410 | // Ensure that clock speed is as expected |
| 411 | float current_clock_speed = measure_clock_speed(); |
| 412 | |
| 413 | // The tolerance needs to be smaller than the relative difference between |
| 414 | // clock speeds that a device could operate under. |
| 415 | // It seems unlikely that a device would be throttling clock speeds by |
| 416 | // amounts smaller than 2%. |
| 417 | // With a value of 1%, I was getting within noise on a Sandy Bridge. |
| 418 | const float clock_speed_tolerance = 0.02f; |
| 419 | |
| 420 | if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) { |
| 421 | // Clock speed is now higher than we previously measured. |
| 422 | // Either our initial measurement was inaccurate, which won't happen |
| 423 | // too many times as we are keeping the best clock speed value and |
| 424 | // and allowing some tolerance; or something really weird happened, |
| 425 | // which invalidates all benchmark results collected so far. |
| 426 | // Either way, we better restart all over again now. |
| 427 | if (benchmark_index) { |
| 428 | cerr << "Restarting at " << 100.0f * ratio_done |
| 429 | << " % because clock speed increased. " << endl; |
| 430 | } |
| 431 | max_clock_speed = current_clock_speed; |
| 432 | first_benchmark_to_run = 0; |
| 433 | return; |
| 434 | } |
| 435 | |
| 436 | bool rerun_last_tests = false; |
| 437 | |
| 438 | if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { |
| 439 | cerr << "Measurements completed so far: " |
| 440 | << 100.0f * ratio_done |
| 441 | << " % " << endl; |
| 442 | cerr << "Clock speed seems to be only " |
| 443 | << current_clock_speed/max_clock_speed |
| 444 | << " times what it used to be." << endl; |
| 445 | |
| 446 | unsigned int seconds_to_sleep_if_lower_clock_speed = 1; |
| 447 | |
| 448 | while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) { |
| 449 | if (seconds_to_sleep_if_lower_clock_speed > 32) { |
| 450 | cerr << "Sleeping longer probably won't make a difference." << endl; |
| 451 | cerr << "Serializing benchmarks to " << session_filename << endl; |
| 452 | serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run); |
| 453 | cerr << "Now restart this benchmark, and it should pick up where we left." << endl; |
| 454 | exit(2); |
| 455 | } |
| 456 | rerun_last_tests = true; |
| 457 | cerr << "Sleeping " |
| 458 | << seconds_to_sleep_if_lower_clock_speed |
| 459 | << " s... \r" << endl; |
| 460 | sleep(seconds_to_sleep_if_lower_clock_speed); |
| 461 | current_clock_speed = measure_clock_speed(); |
| 462 | seconds_to_sleep_if_lower_clock_speed *= 2; |
| 463 | } |
| 464 | } |
| 465 | |
| 466 | if (rerun_last_tests) { |
| 467 | cerr << "Redoing the last " |
| 468 | << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size() |
| 469 | << " % because clock speed had been low. " << endl; |
| 470 | return; |
| 471 | } |
| 472 | |
| 473 | // nothing wrong with the clock speed so far, so there won't be a need to rerun |
| 474 | // benchmarks run so far in case we later encounter a lower clock speed. |
| 475 | first_benchmark_to_run = benchmark_index; |
| 476 | } |
| 477 | |
| 478 | if (benchmark_index == benchmarks.size()) { |
| 479 | // We're done! |
| 480 | first_benchmark_to_run = benchmarks.size(); |
| 481 | // Erase progress info |
| 482 | cerr << " " << endl; |
| 483 | return; |
| 484 | } |
| 485 | |
| 486 | // Display progress info on stderr |
| 487 | if (time_now > time_last_progress_update + 1.0f) { |
| 488 | time_last_progress_update = time_now; |
| 489 | cerr << "Measurements... " << 100.0f * ratio_done |
| 490 | << " %, ETA " |
| 491 | << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done) |
| 492 | << " \r" << flush; |
| 493 | } |
| 494 | |
| 495 | // This is where we actually run a benchmark! |
| 496 | benchmarks[benchmark_index].run(); |
| 497 | benchmark_index++; |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | void run_benchmarks(vector<benchmark_t>& benchmarks) |
| 502 | { |
| 503 | size_t first_benchmark_to_run; |
| 504 | vector<benchmark_t> deserialized_benchmarks; |
| 505 | bool use_deserialized_benchmarks = false; |
| 506 | if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) { |
| 507 | cerr << "Found serialized session with " |
| 508 | << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size() |
| 509 | << " % already done" << endl; |
| 510 | if (deserialized_benchmarks.size() == benchmarks.size() && |
| 511 | first_benchmark_to_run > 0 && |
| 512 | first_benchmark_to_run < benchmarks.size()) |
| 513 | { |
| 514 | use_deserialized_benchmarks = true; |
| 515 | } |
| 516 | } |
| 517 | |
| 518 | if (use_deserialized_benchmarks) { |
| 519 | benchmarks = deserialized_benchmarks; |
| 520 | } else { |
| 521 | // not using deserialized benchmarks, starting from scratch |
| 522 | first_benchmark_to_run = 0; |
| 523 | |
| 524 | // Randomly shuffling benchmarks allows us to get accurate enough progress info, |
| 525 | // as now the cheap/expensive benchmarks are randomly mixed so they average out. |
| 526 | // It also means that if data is corrupted for some time span, the odds are that |
| 527 | // not all repetitions of a given benchmark will be corrupted. |
| 528 | random_shuffle(benchmarks.begin(), benchmarks.end()); |
| 529 | } |
| 530 | |
| 531 | for (int i = 0; i < 4; i++) { |
| 532 | max_clock_speed = max(max_clock_speed, measure_clock_speed()); |
| 533 | } |
| 534 | |
| 535 | double time_start = 0.0; |
| 536 | while (first_benchmark_to_run < benchmarks.size()) { |
| 537 | if (first_benchmark_to_run == 0) { |
| 538 | time_start = timer.getRealTime(); |
| 539 | } |
| 540 | try_run_some_benchmarks(benchmarks, |
| 541 | time_start, |
| 542 | first_benchmark_to_run); |
| 543 | } |
| 544 | |
| 545 | // Sort timings by increasing benchmark parameters, and decreasing gflops. |
| 546 | // The latter is very important. It means that we can ignore all but the first |
| 547 | // benchmark with given parameters. |
| 548 | sort(benchmarks.begin(), benchmarks.end()); |
| 549 | |
| 550 | // Collect best (i.e. now first) results for each parameter values. |
| 551 | vector<benchmark_t> best_benchmarks; |
| 552 | for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { |
| 553 | if (best_benchmarks.empty() || |
| 554 | best_benchmarks.back().compact_product_size != it->compact_product_size || |
| 555 | best_benchmarks.back().compact_block_size != it->compact_block_size) |
| 556 | { |
| 557 | best_benchmarks.push_back(*it); |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | // keep and return only the best benchmarks |
| 562 | benchmarks = best_benchmarks; |
| 563 | } |
| 564 | |
| 565 | struct measure_all_pot_sizes_action_t : action_t |
| 566 | { |
| 567 | virtual const char* invokation_name() const { return "all-pot-sizes"; } |
| 568 | virtual void run() const |
| 569 | { |
| 570 | vector<benchmark_t> benchmarks; |
| 571 | for (int repetition = 0; repetition < measurement_repetitions; repetition++) { |
| 572 | for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { |
| 573 | for (size_t msize = minsize; msize <= maxsize; msize *= 2) { |
| 574 | for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { |
| 575 | for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) { |
| 576 | for (size_t mblock = minsize; mblock <= msize; mblock *= 2) { |
| 577 | for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) { |
| 578 | benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock); |
| 579 | } |
| 580 | } |
| 581 | } |
| 582 | } |
| 583 | } |
| 584 | } |
| 585 | } |
| 586 | |
| 587 | run_benchmarks(benchmarks); |
| 588 | |
| 589 | cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl; |
| 590 | for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { |
| 591 | cout << *it << endl; |
| 592 | } |
| 593 | } |
| 594 | }; |
| 595 | |
| 596 | struct measure_default_sizes_action_t : action_t |
| 597 | { |
| 598 | virtual const char* invokation_name() const { return "default-sizes"; } |
| 599 | virtual void run() const |
| 600 | { |
| 601 | vector<benchmark_t> benchmarks; |
| 602 | for (int repetition = 0; repetition < measurement_repetitions; repetition++) { |
| 603 | for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) { |
| 604 | for (size_t msize = minsize; msize <= maxsize; msize *= 2) { |
| 605 | for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) { |
| 606 | benchmarks.emplace_back(ksize, msize, nsize); |
| 607 | } |
| 608 | } |
| 609 | } |
| 610 | } |
| 611 | |
| 612 | run_benchmarks(benchmarks); |
| 613 | |
| 614 | cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl; |
| 615 | for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) { |
| 616 | cout << *it << endl; |
| 617 | } |
| 618 | } |
| 619 | }; |
| 620 | |
| 621 | int main(int argc, char* argv[]) |
| 622 | { |
| 623 | double time_start = timer.getRealTime(); |
| 624 | cout.precision(4); |
| 625 | cerr.precision(4); |
| 626 | |
| 627 | vector<unique_ptr<action_t>> available_actions; |
| 628 | available_actions.emplace_back(new measure_all_pot_sizes_action_t); |
| 629 | available_actions.emplace_back(new measure_default_sizes_action_t); |
| 630 | |
| 631 | auto action = available_actions.end(); |
| 632 | |
| 633 | if (argc <= 1) { |
| 634 | show_usage_and_exit(argc, argv, available_actions); |
| 635 | } |
| 636 | for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { |
| 637 | if (!strcmp(argv[1], (*it)->invokation_name())) { |
| 638 | action = it; |
| 639 | break; |
| 640 | } |
| 641 | } |
| 642 | |
| 643 | if (action == available_actions.end()) { |
| 644 | show_usage_and_exit(argc, argv, available_actions); |
| 645 | } |
| 646 | |
| 647 | for (int i = 2; i < argc; i++) { |
| 648 | if (argv[i] == strstr(argv[i], "--min-working-set-size=")) { |
| 649 | const char* equals_sign = strchr(argv[i], '='); |
| 650 | min_working_set_size = strtoul(equals_sign+1, nullptr, 10); |
| 651 | } else { |
| 652 | cerr << "unrecognized option: " << argv[i] << endl << endl; |
| 653 | show_usage_and_exit(argc, argv, available_actions); |
| 654 | } |
| 655 | } |
| 656 | |
| 657 | print_cpuinfo(); |
| 658 | |
| 659 | cout << "benchmark parameters:" << endl; |
| 660 | cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl; |
| 661 | cout << "scalar type: " << type_name<Scalar>() << endl; |
| 662 | cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl; |
| 663 | cout << "minsize = " << minsize << endl; |
| 664 | cout << "maxsize = " << maxsize << endl; |
| 665 | cout << "measurement_repetitions = " << measurement_repetitions << endl; |
| 666 | cout << "min_accurate_time = " << min_accurate_time << endl; |
| 667 | cout << "min_working_set_size = " << min_working_set_size; |
| 668 | if (min_working_set_size == 0) { |
| 669 | cout << " (try to outsize caches)"; |
| 670 | } |
| 671 | cout << endl << endl; |
| 672 | |
| 673 | (*action)->run(); |
| 674 | |
| 675 | double time_end = timer.getRealTime(); |
| 676 | cerr << "Finished in " << human_duration_t(time_end - time_start) << endl; |
| 677 | } |