// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "absl/random/internal/nanobenchmark.h"

#include <sys/types.h>

#include <algorithm>  // sort
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>  // memcpy
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "absl/base/internal/raw_logging.h"
#include "absl/random/internal/platform.h"
#include "absl/random/internal/randen_engine.h"

// OS
#if defined(_WIN32) || defined(_WIN64)
#define ABSL_OS_WIN
#include <windows.h>  // NOLINT

#elif defined(__ANDROID__)
#define ABSL_OS_ANDROID

#elif defined(__linux__)
#define ABSL_OS_LINUX
#include <sched.h>        // NOLINT
#include <sys/syscall.h>  // NOLINT
#endif

#if defined(ABSL_ARCH_X86_64) && !defined(ABSL_OS_WIN)
#include <cpuid.h>  // NOLINT
#endif

// __ppc_get_timebase_freq
#if defined(ABSL_ARCH_PPC)
#include <sys/platform/ppc.h>  // NOLINT
#endif

// clock_gettime
#if defined(ABSL_ARCH_ARM) || defined(ABSL_ARCH_AARCH64)
#include <time.h>  // NOLINT
#endif

// ABSL_HAVE_ATTRIBUTE
#if !defined(ABSL_HAVE_ATTRIBUTE)
#ifdef __has_attribute
#define ABSL_HAVE_ATTRIBUTE(x) __has_attribute(x)
#else
#define ABSL_HAVE_ATTRIBUTE(x) 0
#endif
#endif

// ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE prevents inlining of the method.
#if ABSL_HAVE_ATTRIBUTE(noinline) || (defined(__GNUC__) && !defined(__clang__))
#define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __declspec(noinline)
#else
#define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE
#endif

namespace absl {
namespace random_internal_nanobenchmark {
namespace {

// For code folding.
namespace platform {
#if defined(ABSL_ARCH_X86_64)

// TODO(janwas): Merge with the one in randen_hwaes.cc?
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* ABSL_RANDOM_INTERNAL_RESTRICT abcd) {
#if defined(ABSL_OS_WIN)
  int regs[4];
  __cpuidex(regs, level, count);
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else
  uint32_t a, b, c, d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
#endif
}

std::string BrandString() {
  char brand_string[49];
  uint32_t abcd[4];

  // Check if the brand string is supported (it is on all reasonable
  // Intel/AMD CPUs).
  Cpuid(0x80000000U, 0, abcd);
  if (abcd[0] < 0x80000004U) {
    return std::string();
  }

  for (int i = 0; i < 3; ++i) {
    Cpuid(0x80000002U + i, 0, abcd);
    memcpy(brand_string + i * 16, &abcd, sizeof(abcd));
  }
  brand_string[48] = 0;
  return brand_string;
}

// Returns the frequency quoted inside the brand string. This does not
// account for throttling nor Turbo Boost.
double NominalClockRate() {
  const std::string& brand_string = BrandString();
  // Brand strings include the maximum configured frequency. These prefixes are
  // defined by Intel CPUID documentation.
  const char* prefixes[3] = {"MHz", "GHz", "THz"};
  const double multipliers[3] = {1E6, 1E9, 1E12};
  for (size_t i = 0; i < 3; ++i) {
    const size_t pos_prefix = brand_string.find(prefixes[i]);
    if (pos_prefix != std::string::npos) {
      const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
      if (pos_space != std::string::npos) {
        const std::string digits =
            brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
        return std::stod(digits) * multipliers[i];
      }
    }
  }

  return 0.0;
}
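
// Worked example (hypothetical brand string): "Intel(R) Core(TM) i7-6700K
// CPU @ 4.00GHz" contains the "GHz" prefix; the digits between the preceding
// space and the prefix, "4.00", yield 4.00 * 1E9 = 4e9 ticks per second.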

#endif  // ABSL_ARCH_X86_64
}  // namespace platform

// Prevents the compiler from eliding the computations that led to "output".
template <class T>
inline void PreventElision(T&& output) {
#ifndef ABSL_OS_WIN
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#else
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  static std::atomic<T> dummy(T{});
  dummy.store(output, std::memory_order_relaxed);
#endif
}
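
// Usage sketch (hypothetical Compute()): keep each iteration's result "live"
// without forcing a write to memory:
//   for (size_t i = 0; i < iters; ++i) {
//     PreventElision(Compute(i));  // compiler must materialize the result
//   }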

namespace timer {

// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to keep
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.

// Returns a 64-bit timestamp in units of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
inline uint64_t Start64() {
  uint64_t t;
#if defined(ABSL_ARCH_PPC)
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif defined(ABSL_ARCH_X86_64)
#if defined(ABSL_OS_WIN)
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  t = __rdtsc();
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#endif
#else
  // Fall back to the OS - unsure how to reliably query cntvct_el0 frequency.
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
#endif
  return t;
}

inline uint64_t Stop64() {
  uint64_t t;
#if defined(ABSL_ARCH_X86_64)
#if defined(ABSL_OS_WIN)
  _ReadWriteBarrier();
  unsigned aux;
  t = __rdtscp(&aux);
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#endif
#else
  t = Start64();
#endif
  return t;
}

// Returns a 32-bit timestamp with about 4 cycles less overhead than
// Start64. Only suitable for measuring very short regions because the
// timestamp overflows about once a second.
inline uint32_t Start32() {
  uint32_t t;
#if defined(ABSL_ARCH_X86_64)
#if defined(ABSL_OS_WIN)
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  t = static_cast<uint32_t>(__rdtsc());
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      : "rdx", "memory");
#endif
#else
  t = static_cast<uint32_t>(Start64());
#endif
  return t;
}

inline uint32_t Stop32() {
  uint32_t t;
#if defined(ABSL_ARCH_X86_64)
#if defined(ABSL_OS_WIN)
  _ReadWriteBarrier();
  unsigned aux;
  t = static_cast<uint32_t>(__rdtscp(&aux));
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      : "rcx", "rdx", "memory");
#endif
#else
  t = static_cast<uint32_t>(Stop64());
#endif
  return t;
}

}  // namespace timer
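
// Measurement sketch (hypothetical Workload()): Start64/Stop64 bracket the
// region; dividing the tick delta by InvariantTicksPerSecond() (declared in
// nanobenchmark.h) converts to seconds:
//   const uint64_t t0 = timer::Start64();
//   Workload();  // region to measure
//   const uint64_t t1 = timer::Stop64();
//   const double seconds = (t1 - t0) / InvariantTicksPerSecond();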

namespace robust_statistics {

// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
// than std::sort for input distributions with very few unique values.
template <class T>
void CountingSort(T* values, size_t num_values) {
  // Unique values and their frequency (similar to flat_map).
  using Unique = std::pair<T, int>;
  std::vector<Unique> unique;
  for (size_t i = 0; i < num_values; ++i) {
    const T value = values[i];
    const auto pos =
        std::find_if(unique.begin(), unique.end(),
                     [value](const Unique u) { return u.first == value; });
    if (pos == unique.end()) {
      unique.push_back(std::make_pair(value, 1));
    } else {
      ++pos->second;
    }
  }

  // Sort in ascending order of value (pair.first).
  std::sort(unique.begin(), unique.end());

  // Write that many copies of each unique value to the array.
  T* ABSL_RANDOM_INTERNAL_RESTRICT p = values;
  for (const auto& value_count : unique) {
    std::fill(p, p + value_count.second, value_count.first);
    p += value_count.second;
  }
  ABSL_RAW_CHECK(p == values + num_values, "Did not produce enough output");
}
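
// Worked example (hypothetical data): {5, 3, 5, 3, 3} builds the frequency
// table {(5,2), (3,3)}, which sorts to {(3,3), (5,2)} and expands back into
// the array as {3, 3, 3, 5, 5}.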

// @return i in [idx_begin, idx_begin + half_count) that minimizes
// sorted[i + half_count] - sorted[i].
template <typename T>
size_t MinRange(const T* const ABSL_RANDOM_INTERNAL_RESTRICT sorted,
                const size_t idx_begin, const size_t half_count) {
  T min_range = (std::numeric_limits<T>::max)();
  size_t min_idx = 0;

  for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
    ABSL_RAW_CHECK(sorted[idx] <= sorted[idx + half_count], "Not sorted");
    const T range = sorted[idx + half_count] - sorted[idx];
    if (range < min_range) {
      min_range = range;
      min_idx = idx;
    }
  }

  return min_idx;
}

// Returns an estimate of the mode by calling MinRange on successively
// halved intervals. "sorted" must be in ascending order. This is the
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
// estimator of the mode", with complexity O(N log N). The mode is less
// affected by outliers in highly-skewed distributions than the median.
// The averaging operation below assumes "T" is an unsigned integer type.
template <typename T>
T ModeOfSorted(const T* const ABSL_RANDOM_INTERNAL_RESTRICT sorted,
               const size_t num_values) {
  size_t idx_begin = 0;
  size_t half_count = num_values / 2;
  while (half_count > 1) {
    idx_begin = MinRange(sorted, idx_begin, half_count);
    half_count >>= 1;
  }

  const T x = sorted[idx_begin + 0];
  if (half_count == 0) {
    return x;
  }
  ABSL_RAW_CHECK(half_count == 1, "Should stop at half_count=1");
  const T average = (x + sorted[idx_begin + 1] + 1) / 2;
  return average;
}
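
// Worked example (hypothetical data): sorted = {1, 2, 2, 2, 3, 11}.
// With half_count = 3, the ranges sorted[i + 3] - sorted[i] are {1, 1, 9},
// so MinRange returns idx_begin = 0; half_count then becomes 1 and the loop
// exits. The result is (1 + 2 + 1) / 2 = 2, the true mode, despite the
// outlier 11.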

// Returns the mode. Side effect: sorts "values".
template <typename T>
T Mode(T* values, const size_t num_values) {
  CountingSort(values, num_values);
  return ModeOfSorted(values, num_values);
}

template <typename T, size_t N>
T Mode(T (&values)[N]) {
  return Mode(&values[0], N);
}

// Returns the median value. Side effect: sorts "values".
template <typename T>
T Median(T* values, const size_t num_values) {
  ABSL_RAW_CHECK(num_values != 0, "Empty input");
  std::sort(values, values + num_values);
  const size_t half = num_values / 2;
  // Odd count: return middle.
  if (num_values % 2) {
    return values[half];
  }
  // Even count: return average of middle two.
  return (values[half] + values[half - 1] + 1) / 2;
}

// Returns a robust measure of variability.
template <typename T>
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
                          const T median) {
  ABSL_RAW_CHECK(num_values != 0, "Empty input");
  std::vector<T> abs_deviations;
  abs_deviations.reserve(num_values);
  for (size_t i = 0; i < num_values; ++i) {
    const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
    abs_deviations.push_back(static_cast<T>(abs));
  }
  return Median(abs_deviations.data(), num_values);
}
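
// Worked example (hypothetical data): values = {3, 5, 7, 20} have median 6.
// The absolute deviations are {3, 1, 1, 14}, whose median is 2; the outlier
// 20 barely affects the result, unlike a standard deviation.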

}  // namespace robust_statistics

// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow. 32-bit timers are faster to
// read than 64-bit.
using Ticks = uint32_t;

// Returns timer overhead / minimum measurable difference.
Ticks TimerResolution() {
  // Nested loop avoids exceeding stack/L1 capacity.
  Ticks repetitions[Params::kTimerSamples];
  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
    Ticks samples[Params::kTimerSamples];
    for (size_t i = 0; i < Params::kTimerSamples; ++i) {
      const Ticks t0 = timer::Start32();
      const Ticks t1 = timer::Stop32();
      samples[i] = t1 - t0;
    }
    repetitions[rep] = robust_statistics::Mode(samples);
  }
  return robust_statistics::Mode(repetitions);
}

static const Ticks timer_resolution = TimerResolution();

// Estimates the expected duration of "lambda" by sampling a variable number
// of times until the variability "rel_mad" is less than "max_rel_mad".
template <class Lambda>
Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
                        const Params& p, const Lambda& lambda) {
  auto measure_duration = [&lambda]() -> Ticks {
    const Ticks t0 = timer::Start32();
    lambda();
    const Ticks t1 = timer::Stop32();
    return t1 - t0;
  };

  // Choose initial samples_per_eval based on a single estimated duration.
  Ticks est = measure_duration();
  static const double ticks_per_second = InvariantTicksPerSecond();
  const size_t ticks_per_eval = ticks_per_second * p.seconds_per_eval;
  size_t samples_per_eval = ticks_per_eval / est;
  samples_per_eval = (std::max)(samples_per_eval, p.min_samples_per_eval);

  std::vector<Ticks> samples;
  samples.reserve(1 + samples_per_eval);
  samples.push_back(est);

  // A percentage is too strict for tiny differences, so also allow a small
  // absolute "median absolute deviation".
  const Ticks max_abs_mad = (timer_resolution + 99) / 100;
  *rel_mad = 0.0;  // ensure initialized

  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
    samples.reserve(samples.size() + samples_per_eval);
    for (size_t i = 0; i < samples_per_eval; ++i) {
      const Ticks r = measure_duration();
      samples.push_back(r);
    }

    if (samples.size() >= p.min_mode_samples) {
      est = robust_statistics::Mode(samples.data(), samples.size());
    } else {
      // For "few" samples (how many also depends on the variance), Median is
      // safer.
      est = robust_statistics::Median(samples.data(), samples.size());
    }
    ABSL_RAW_CHECK(est != 0, "Estimator returned zero duration");

    // Median absolute deviation (MAD) is a robust measure of 'variability'.
    const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
        samples.data(), samples.size(), est);
    *rel_mad = static_cast<double>(static_cast<int>(abs_mad)) / est;

    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
      if (p.verbose) {
        ABSL_RAW_LOG(INFO,
                     "%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
                     samples.size(), est, abs_mad, *rel_mad * 100.0);
      }
      return est;
    }
  }

  if (p.verbose) {
    ABSL_RAW_LOG(WARNING,
                 "rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
                 *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
  }
  return est;
}
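
// Usage sketch (hypothetical Workload(); Params is declared in
// nanobenchmark.h): sample until the relative MAD drops below 1% or
// p.max_evals is exhausted:
//   double rel_mad;
//   Params p;
//   const Ticks est = SampleUntilStable(0.01, &rel_mad, p,
//                                       [] { Workload(); });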

using InputVec = std::vector<FuncInput>;

// Returns a vector of unique input values.
InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
  InputVec unique(inputs, inputs + num_inputs);
  std::sort(unique.begin(), unique.end());
  unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
  return unique;
}

// Returns how often we need to call func for sufficient precision, or zero
// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
size_t NumSkip(const Func func, const void* arg, const InputVec& unique,
               const Params& p) {
  // Min elapsed ticks for any input.
  Ticks min_duration = ~0u;

  for (const FuncInput input : unique) {
    // Make sure a 32-bit timer is sufficient.
    const uint64_t t0 = timer::Start64();
    PreventElision(func(arg, input));
    const uint64_t t1 = timer::Stop64();
    const uint64_t elapsed = t1 - t0;
    if (elapsed >= (1ULL << 30)) {
      ABSL_RAW_LOG(WARNING,
                   "Measurement failed: need 64-bit timer for input=%zu\n",
                   static_cast<size_t>(input));
      return 0;
    }

    double rel_mad;
    const Ticks total = SampleUntilStable(
        p.target_rel_mad, &rel_mad, p,
        [func, arg, input]() { PreventElision(func(arg, input)); });
    min_duration = (std::min)(min_duration, total - timer_resolution);
  }

  // Number of repetitions required to reach the target resolution.
  const size_t max_skip = p.precision_divisor;
  // Number of repetitions given the estimated duration.
  const size_t num_skip =
      min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
  if (p.verbose) {
    ABSL_RAW_LOG(INFO, "res=%u max_skip=%zu min_dur=%u num_skip=%zu\n",
                 timer_resolution, max_skip, min_duration, num_skip);
  }
  return num_skip;
}
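
// Example arithmetic (hypothetical values): with p.precision_divisor = 1024
// and min_duration = 100 ticks, num_skip = ceil(1024 / 100) = 11, i.e. 11
// occurrences of each input will be omitted from the subset measurements.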

// Replicates inputs until we can omit "num_skip" occurrences of an input.
InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
                         const size_t num_unique, const size_t num_skip,
                         const Params& p) {
  InputVec full;
  if (num_unique == 1) {
    full.assign(p.subset_ratio * num_skip, inputs[0]);
    return full;
  }

  full.reserve(p.subset_ratio * num_skip * num_inputs);
  for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
    full.insert(full.end(), inputs, inputs + num_inputs);
  }
  absl::random_internal::randen_engine<uint32_t> rng;
  std::shuffle(full.begin(), full.end(), rng);
  return full;
}

// Copies "full" into "subset" in the same order, but with "num_skip"
// randomly selected occurrences of "input_to_skip" removed.
void FillSubset(const InputVec& full, const FuncInput input_to_skip,
                const size_t num_skip, InputVec* subset) {
  const size_t count = std::count(full.begin(), full.end(), input_to_skip);
  // Generate num_skip random indices: which occurrence to skip.
  std::vector<uint32_t> omit;
  // Replacement for std::iota, not yet available in MSVC builds.
  omit.reserve(count);
  for (size_t i = 0; i < count; ++i) {
    omit.push_back(i);
  }
  // omit[] is the same on every call, but that's OK because they identify the
  // Nth instance of input_to_skip, so the position within full[] differs.
  absl::random_internal::randen_engine<uint32_t> rng;
  std::shuffle(omit.begin(), omit.end(), rng);
  omit.resize(num_skip);
  std::sort(omit.begin(), omit.end());

  uint32_t occurrence = ~0u;  // 0 after preincrement
  size_t idx_omit = 0;        // cursor within omit[]
  size_t idx_subset = 0;      // cursor within *subset
  for (const FuncInput next : full) {
    if (next == input_to_skip) {
      ++occurrence;
      // Haven't removed enough already.
      if (idx_omit < num_skip) {
        // This one is up for removal.
        if (occurrence == omit[idx_omit]) {
          ++idx_omit;
          continue;
        }
      }
    }
    if (idx_subset < subset->size()) {
      (*subset)[idx_subset++] = next;
    }
  }
  ABSL_RAW_CHECK(idx_subset == subset->size(), "idx_subset not at end");
  ABSL_RAW_CHECK(idx_omit == omit.size(), "idx_omit not at end");
  ABSL_RAW_CHECK(occurrence == count - 1, "occurrence not at end");
}

// Returns total ticks elapsed for all inputs.
Ticks TotalDuration(const Func func, const void* arg, const InputVec* inputs,
                    const Params& p, double* max_rel_mad) {
  double rel_mad;
  const Ticks duration =
      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
        for (const FuncInput input : *inputs) {
          PreventElision(func(arg, input));
        }
      });
  *max_rel_mad = (std::max)(*max_rel_mad, rel_mad);
  return duration;
}

// (Nearly) empty Func for measuring timer overhead/resolution.
ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE FuncOutput
EmptyFunc(const void* arg, const FuncInput input) {
  return input;
}

// Returns overhead of accessing inputs[] and calling a function; this will
// be deducted from future TotalDuration return values.
Ticks Overhead(const void* arg, const InputVec* inputs, const Params& p) {
  double rel_mad;
  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
    for (const FuncInput input : *inputs) {
      PreventElision(EmptyFunc(arg, input));
    }
  });
}
}  // namespace

void PinThreadToCPU(int cpu) {
  // We might migrate to another CPU before pinning below, but at least cpu
  // will be one of the CPUs on which this thread ran.
#if defined(ABSL_OS_WIN)
  if (cpu < 0) {
    cpu = static_cast<int>(GetCurrentProcessorNumber());
    ABSL_RAW_CHECK(cpu >= 0, "PinThreadToCPU detect failed");
    if (cpu >= 64) {
      // NOTE: On wine, at least, GetCurrentProcessorNumber() sometimes returns
      // a value > 64, which is out of range. When this happens, log a message
      // and don't set a cpu affinity.
      ABSL_RAW_LOG(ERROR, "Invalid CPU number: %d", cpu);
      return;
    }
  } else if (cpu >= 64) {
    // User specified an explicit CPU affinity > the valid range.
    ABSL_RAW_LOG(FATAL, "Invalid CPU number: %d", cpu);
  }
  const DWORD_PTR prev = SetThreadAffinityMask(GetCurrentThread(), 1ULL << cpu);
  ABSL_RAW_CHECK(prev != 0, "SetAffinity failed");
#elif defined(ABSL_OS_LINUX) && !defined(ABSL_OS_ANDROID)
  if (cpu < 0) {
    cpu = sched_getcpu();
    ABSL_RAW_CHECK(cpu >= 0, "PinThreadToCPU detect failed");
  }
  const pid_t pid = 0;  // current thread
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  const int err = sched_setaffinity(pid, sizeof(set), &set);
  ABSL_RAW_CHECK(err == 0, "SetAffinity failed");
#endif
}

// Returns the tick rate. "Invariant" means the tick counter frequency is
// independent of CPU throttling or sleep. This call may be expensive; the
// caller should cache the result.
double InvariantTicksPerSecond() {
#if defined(ABSL_ARCH_PPC)
  return __ppc_get_timebase_freq();
#elif defined(ABSL_ARCH_X86_64)
  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
  return platform::NominalClockRate();
#else
  // Fall back to clock_gettime nanoseconds.
  return 1E9;
#endif
}

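// Estimates per-call ticks for each unique input via leave-one-out: "full"
// contains every input, and "subset" omits num_skip occurrences of one input.
// The difference between the two overhead-corrected totals is the cost of the
// num_skip omitted calls, so per-call ticks are approximately
//   ((total - overhead) - (total_skip - overhead_skip)) / num_skip.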
size_t MeasureImpl(const Func func, const void* arg, const size_t num_skip,
                   const InputVec& unique, const InputVec& full,
                   const Params& p, Result* results) {
  const float mul = 1.0f / static_cast<int>(num_skip);

  InputVec subset(full.size() - num_skip);
  const Ticks overhead = Overhead(arg, &full, p);
  const Ticks overhead_skip = Overhead(arg, &subset, p);
  if (overhead < overhead_skip) {
    ABSL_RAW_LOG(WARNING, "Measurement failed: overhead %u < %u\n", overhead,
                 overhead_skip);
    return 0;
  }

  if (p.verbose) {
    ABSL_RAW_LOG(INFO, "#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(),
                 subset.size(), overhead, overhead_skip);
  }

  double max_rel_mad = 0.0;
  const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);

  for (size_t i = 0; i < unique.size(); ++i) {
    FillSubset(full, unique[i], num_skip, &subset);
    const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);

    if (total < total_skip) {
      ABSL_RAW_LOG(WARNING, "Measurement failed: total %u < %u\n", total,
                   total_skip);
      return 0;
    }

    const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
    results[i].input = unique[i];
    results[i].ticks = duration * mul;
    results[i].variability = max_rel_mad;
  }

  return unique.size();
}

size_t Measure(const Func func, const void* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results, const Params& p) {
  ABSL_RAW_CHECK(num_inputs != 0, "No inputs");

  const InputVec unique = UniqueInputs(inputs, num_inputs);
  const size_t num_skip = NumSkip(func, arg, unique, p);  // 0 on failure
  if (num_skip == 0) return 0;  // NumSkip already printed an error message.

  const InputVec full =
      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);

  // MeasureImpl may fail up to p.max_measure_retries times.
  for (size_t i = 0; i < p.max_measure_retries; i++) {
    auto result = MeasureImpl(func, arg, num_skip, unique, full, p, results);
    if (result != 0) {
      return result;
    }
  }
  // All retries failed. (Unusual.)
  return 0;
}
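
// Usage sketch (hypothetical ToMeasure(); Func, FuncInput, Result and Params
// are declared in nanobenchmark.h): measure per-call ticks for three inputs:
//   FuncOutput ToMeasure(const void* arg, FuncInput input) {
//     return input * 2;  // stand-in workload
//   }
//   const FuncInput inputs[] = {10, 100, 1000};
//   Result results[3];
//   Params params;
//   const size_t num_results =
//       Measure(&ToMeasure, nullptr, inputs, 3, results, params);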

}  // namespace random_internal_nanobenchmark
}  // namespace absl