blob: c2fb3dedefb4977f4f7e5b6a4cdda30409ed2912 [file] [log] [blame]
Austin Schuh189376f2018-12-20 22:11:15 +11001#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
2#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
3
// Index type used for every tensor extent in this header. The macro below
// names the same type for Eigen's default dense index and therefore has to be
// defined before the Eigen header is included.
using TensorIndex = int;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
6
7#include "unsupported/Eigen/CXX11/Tensor"
8#include "benchmark.h"
9
// Shorthand for registering `bench` through BENCHMARK() and forwarding `lo`
// and `hi` to the Range() call on the object that registration returns.
#define BENCHMARK_RANGE(bench, lo, hi) BENCHMARK(bench)->Range(lo, hi)
12
13using Eigen::Tensor;
14using Eigen::TensorMap;
15
16// TODO(bsteiner): also templatize on the input type since we have users
17// for int8 as well as floats.
18template <typename Device, typename T> class BenchmarkSuite {
19 public:
20 BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
21 : m_(m), k_(k), n_(n), device_(device) {
22 initialize();
23 }
24
25 BenchmarkSuite(const Device& device, size_t m)
26 : m_(m), k_(m), n_(m), device_(device) {
27 initialize();
28 }
29
30 ~BenchmarkSuite() {
31 device_.deallocate(a_);
32 device_.deallocate(b_);
33 device_.deallocate(c_);
34 }
35
36 void memcpy(int num_iters) {
37 eigen_assert(m_ == k_ && k_ == n_);
38 StartBenchmarkTiming();
39 for (int iter = 0; iter < num_iters; ++iter) {
40 device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
41 }
42 // Record the number of values copied per second
43 finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
44 }
45
46 void typeCasting(int num_iters) {
47 eigen_assert(m_ == n_);
48 Eigen::array<TensorIndex, 2> sizes;
49 if (sizeof(T) >= sizeof(int)) {
50 sizes[0] = m_;
51 sizes[1] = k_;
52 } else {
53 sizes[0] = m_ * sizeof(T) / sizeof(int);
54 sizes[1] = k_ * sizeof(T) / sizeof(int);
55 }
56 const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
57 TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
58
59 StartBenchmarkTiming();
60 for (int iter = 0; iter < num_iters; ++iter) {
61 B.device(device_) = A.template cast<T>();
62 }
63 // Record the number of values copied per second
64 finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
65 }
66
67 void random(int num_iters) {
68 eigen_assert(m_ == k_ && k_ == n_);
69 Eigen::array<TensorIndex, 2> sizes;
70 sizes[0] = m_;
71 sizes[1] = m_;
72 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
73
74 StartBenchmarkTiming();
75 for (int iter = 0; iter < num_iters; ++iter) {
76 C.device(device_) = C.random();
77 }
78 // Record the number of random numbers generated per second
79 finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
80 }
81
82 void slicing(int num_iters) {
83 eigen_assert(m_ == k_ && k_ == n_);
84 Eigen::array<TensorIndex, 2> sizes;
85 sizes[0] = m_;
86 sizes[1] = m_;
87 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
88 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
89 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
90
91 const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
92 const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
93 const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
94 const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
95 const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
96
97 StartBenchmarkTiming();
98 for (int iter = 0; iter < num_iters; ++iter) {
99 C.slice(first_quadrant, quarter_sizes).device(device_) =
100 A.slice(first_quadrant, quarter_sizes);
101 C.slice(second_quadrant, quarter_sizes).device(device_) =
102 B.slice(second_quadrant, quarter_sizes);
103 C.slice(third_quadrant, quarter_sizes).device(device_) =
104 A.slice(third_quadrant, quarter_sizes);
105 C.slice(fourth_quadrant, quarter_sizes).device(device_) =
106 B.slice(fourth_quadrant, quarter_sizes);
107 }
108 // Record the number of values copied from the rhs slice to the lhs slice
109 // each second
110 finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
111 }
112
113 void rowChip(int num_iters) {
114 Eigen::array<TensorIndex, 2> input_size;
115 input_size[0] = k_;
116 input_size[1] = n_;
117 const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
118 Eigen::array<TensorIndex, 1> output_size;
119 output_size[0] = n_;
120 TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
121
122 StartBenchmarkTiming();
123 for (int iter = 0; iter < num_iters; ++iter) {
124 C.device(device_) = B.chip(iter % k_, 0);
125 }
126 // Record the number of values copied from the rhs chip to the lhs.
127 finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
128 }
129
130 void colChip(int num_iters) {
131 Eigen::array<TensorIndex, 2> input_size;
132 input_size[0] = k_;
133 input_size[1] = n_;
134 const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
135 Eigen::array<TensorIndex, 1> output_size;
136 output_size[0] = n_;
137 TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
138
139 StartBenchmarkTiming();
140 for (int iter = 0; iter < num_iters; ++iter) {
141 C.device(device_) = B.chip(iter % n_, 1);
142 }
143 // Record the number of values copied from the rhs chip to the lhs.
144 finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
145 }
146
147 void shuffling(int num_iters) {
148 eigen_assert(m_ == n_);
149 Eigen::array<TensorIndex, 2> size_a;
150 size_a[0] = m_;
151 size_a[1] = k_;
152 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
153 Eigen::array<TensorIndex, 2> size_b;
154 size_b[0] = k_;
155 size_b[1] = m_;
156 TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
157
158 Eigen::array<int, 2> shuffle;
159 shuffle[0] = 1;
160 shuffle[1] = 0;
161
162 StartBenchmarkTiming();
163 for (int iter = 0; iter < num_iters; ++iter) {
164 B.device(device_) = A.shuffle(shuffle);
165 }
166 // Record the number of values shuffled from A and copied to B each second
167 finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
168 }
169
170 void padding(int num_iters) {
171 eigen_assert(m_ == k_);
172 Eigen::array<TensorIndex, 2> size_a;
173 size_a[0] = m_;
174 size_a[1] = k_-3;
175 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
176 Eigen::array<TensorIndex, 2> size_b;
177 size_b[0] = k_;
178 size_b[1] = m_;
179 TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
180
181#if defined(EIGEN_HAS_INDEX_LIST)
182 Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
183 Eigen::type2indexpair<2, 1> > paddings;
184#else
185 Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
186 paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
187 paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
188#endif
189
190 StartBenchmarkTiming();
191 for (int iter = 0; iter < num_iters; ++iter) {
192 B.device(device_) = A.pad(paddings);
193 }
194 // Record the number of values copied from the padded tensor A each second
195 finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
196 }
197
198 void striding(int num_iters) {
199 eigen_assert(m_ == k_);
200 Eigen::array<TensorIndex, 2> size_a;
201 size_a[0] = m_;
202 size_a[1] = k_;
203 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
204 Eigen::array<TensorIndex, 2> size_b;
205 size_b[0] = m_;
206 size_b[1] = k_/2;
207 TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
208
209#ifndef EIGEN_HAS_INDEX_LIST
210 Eigen::array<TensorIndex, 2> strides;
211 strides[0] = 1;
212 strides[1] = 2;
213#else
214 // Take advantage of cxx11 to give the compiler information it can use to
215 // optimize the code.
216 Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
217#endif
218
219 StartBenchmarkTiming();
220 for (int iter = 0; iter < num_iters; ++iter) {
221 B.device(device_) = A.stride(strides);
222 }
223 // Record the number of values copied from the padded tensor A each second
224 finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
225 }
226
227 void broadcasting(int num_iters) {
228 Eigen::array<TensorIndex, 2> size_a;
229 size_a[0] = m_;
230 size_a[1] = 1;
231 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
232 Eigen::array<TensorIndex, 2> size_c;
233 size_c[0] = m_;
234 size_c[1] = n_;
235 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
236
237#ifndef EIGEN_HAS_INDEX_LIST
238 Eigen::array<int, 2> broadcast;
239 broadcast[0] = 1;
240 broadcast[1] = n_;
241#else
242 // Take advantage of cxx11 to give the compiler information it can use to
243 // optimize the code.
244 Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
245 broadcast.set(1, n_);
246#endif
247
248 StartBenchmarkTiming();
249 for (int iter = 0; iter < num_iters; ++iter) {
250 C.device(device_) = A.broadcast(broadcast);
251 }
252 // Record the number of values broadcasted from A and copied to C each second
253 finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
254 }
255
256 void coeffWiseOp(int num_iters) {
257 eigen_assert(m_ == k_ && k_ == n_);
258 Eigen::array<TensorIndex, 2> sizes;
259 sizes[0] = m_;
260 sizes[1] = m_;
261 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
262 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
263 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
264
265 StartBenchmarkTiming();
266 for (int iter = 0; iter < num_iters; ++iter) {
267 C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
268 }
269 // Record the number of FLOP executed per second (2 multiplications and
270 // 1 addition per value)
271 finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
272 }
273
274 void algebraicFunc(int num_iters) {
275 eigen_assert(m_ == k_ && k_ == n_);
276 Eigen::array<TensorIndex, 2> sizes;
277 sizes[0] = m_;
278 sizes[1] = m_;
279 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
280 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
281 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
282
283 StartBenchmarkTiming();
284 for (int iter = 0; iter < num_iters; ++iter) {
285 C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
286 }
287 // Record the number of FLOP executed per second (assuming one operation
288 // per value)
289 finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
290 }
291
292 void transcendentalFunc(int num_iters) {
293 eigen_assert(m_ == k_ && k_ == n_);
294 Eigen::array<TensorIndex, 2> sizes;
295 sizes[0] = m_;
296 sizes[1] = m_;
297 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
298 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
299 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
300
301 StartBenchmarkTiming();
302 for (int iter = 0; iter < num_iters; ++iter) {
303 C.device(device_) = A.exp() + B.log();
304 }
305 // Record the number of FLOP executed per second (assuming one operation
306 // per value)
307 finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
308 }
309
310 // Row reduction
311 void rowReduction(int num_iters) {
312 Eigen::array<TensorIndex, 2> input_size;
313 input_size[0] = k_;
314 input_size[1] = n_;
315 const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
316 Eigen::array<TensorIndex, 1> output_size;
317 output_size[0] = n_;
318 TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
319
320#ifndef EIGEN_HAS_INDEX_LIST
321 Eigen::array<TensorIndex, 1> sum_along_dim;
322 sum_along_dim[0] = 0;
323#else
324 // Take advantage of cxx11 to give the compiler information it can use to
325 // optimize the code.
326 Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
327#endif
328
329 StartBenchmarkTiming();
330 for (int iter = 0; iter < num_iters; ++iter) {
331 C.device(device_) = B.sum(sum_along_dim);
332 }
333 // Record the number of FLOP executed per second (assuming one operation
334 // per value)
335 finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
336 }
337
338 // Column reduction
339 void colReduction(int num_iters) {
340 Eigen::array<TensorIndex, 2> input_size;
341 input_size[0] = k_;
342 input_size[1] = n_;
343 const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
344 b_, input_size);
345 Eigen::array<TensorIndex, 1> output_size;
346 output_size[0] = k_;
347 TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
348 c_, output_size);
349
350#ifndef EIGEN_HAS_INDEX_LIST
351 Eigen::array<TensorIndex, 1> sum_along_dim;
352 sum_along_dim[0] = 1;
353#else
354 // Take advantage of cxx11 to give the compiler information it can use to
355 // optimize the code.
356 Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
357#endif
358
359 StartBenchmarkTiming();
360 for (int iter = 0; iter < num_iters; ++iter) {
361 C.device(device_) = B.sum(sum_along_dim);
362 }
363 // Record the number of FLOP executed per second (assuming one operation
364 // per value)
365 finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
366 }
367
368 // Full reduction
369 void fullReduction(int num_iters) {
370 Eigen::array<TensorIndex, 2> input_size;
371 input_size[0] = k_;
372 input_size[1] = n_;
373 const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
374 b_, input_size);
375 Eigen::array<TensorIndex, 0> output_size;
376 TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
377 c_, output_size);
378
379 StartBenchmarkTiming();
380 for (int iter = 0; iter < num_iters; ++iter) {
381 C.device(device_) = B.sum();
382 }
383 // Record the number of FLOP executed per second (assuming one operation
384 // per value)
385 finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
386 }
387
388 // do a contraction which is equivalent to a matrix multiplication
389 void contraction(int num_iters) {
390 Eigen::array<TensorIndex, 2> sizeA;
391 sizeA[0] = m_;
392 sizeA[1] = k_;
393 Eigen::array<TensorIndex, 2> sizeB;
394 sizeB[0] = k_;
395 sizeB[1] = n_;
396 Eigen::array<TensorIndex, 2> sizeC;
397 sizeC[0] = m_;
398 sizeC[1] = n_;
399
400 const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
401 const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
402 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
403
404 typedef typename Tensor<T, 2>::DimensionPair DimPair;
405 Eigen::array<DimPair, 1> dims;
406 dims[0] = DimPair(1, 0);
407
408 StartBenchmarkTiming();
409 for (int iter = 0; iter < num_iters; ++iter) {
410 C.device(device_) = A.contract(B, dims);
411 }
412 // Record the number of FLOP executed per second (size_ multiplications and
413 // additions for each value in the resulting tensor)
414 finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
415 }
416
417 void convolution(int num_iters, int kernel_x, int kernel_y) {
418 Eigen::array<TensorIndex, 2> input_sizes;
419 input_sizes[0] = m_;
420 input_sizes[1] = n_;
421 TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
422 Eigen::array<TensorIndex, 2> kernel_sizes;
423 kernel_sizes[0] = kernel_x;
424 kernel_sizes[1] = kernel_y;
425 TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
426 Eigen::array<TensorIndex, 2> result_sizes;
427 result_sizes[0] = m_ - kernel_x + 1;
428 result_sizes[1] = n_ - kernel_y + 1;
429 TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
430 Eigen::array<TensorIndex, 2> dims;
431 dims[0] = 0;
432 dims[1] = 1;
433
434 StartBenchmarkTiming();
435 for (int iter = 0; iter < num_iters; ++iter) {
436 C.device(device_) = A.convolve(B, dims);
437 }
438 // Record the number of FLOP executed per second (kernel_size
439 // multiplications and additions for each value in the resulting tensor)
440 finalizeBenchmark(static_cast<int64_t>(2) *
441 (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
442 }
443
444 private:
445 void initialize() {
446 a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
447 b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
448 c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
449
450 // Initialize the content of the memory pools to prevent asan from
451 // complaining.
452 device_.memset(a_, 12, m_ * k_ * sizeof(T));
453 device_.memset(b_, 23, k_ * n_ * sizeof(T));
454 device_.memset(c_, 31, m_ * n_ * sizeof(T));
455
456 //BenchmarkUseRealTime();
457 }
458
459 inline void finalizeBenchmark(int64_t num_items) {
460#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
461 if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
462 device_.synchronize();
463 }
464#endif
465 StopBenchmarkTiming();
466 SetBenchmarkFlopsProcessed(num_items);
467 }
468
469
470 TensorIndex m_;
471 TensorIndex k_;
472 TensorIndex n_;
473 T* a_;
474 T* b_;
475 T* c_;
476 Device device_;
477};
478#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_