Blame - bench/tensors/tensor_benchmarks.h - RealtimeRoboticsGroup/test

blob: c2fb3dedefb4977f4f7e5b6a4cdda30409ed2912 [file] [log] [blame]

Austin Schuh	189376f	2018-12-20 22:11:15 +1100	[diff] [blame^]	1	#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
				2	#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
				3
				4	typedef int TensorIndex;
				5	#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
				6
				7	#include "unsupported/Eigen/CXX11/Tensor"
				8	#include "benchmark.h"
				9
				10	#define BENCHMARK_RANGE(bench, lo, hi) \
				11	BENCHMARK(bench)->Range(lo, hi)
				12
				13	using Eigen::Tensor;
				14	using Eigen::TensorMap;
				15
				16	// TODO(bsteiner): also templatize on the input type since we have users
				17	// for int8 as well as floats.
				18	template <typename Device, typename T> class BenchmarkSuite {
				19	public:
				20	BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
				21	: m_(m), k_(k), n_(n), device_(device) {
				22	initialize();
				23	}
				24
				25	BenchmarkSuite(const Device& device, size_t m)
				26	: m_(m), k_(m), n_(m), device_(device) {
				27	initialize();
				28	}
				29
				30	~BenchmarkSuite() {
				31	device_.deallocate(a_);
				32	device_.deallocate(b_);
				33	device_.deallocate(c_);
				34	}
				35
				36	void memcpy(int num_iters) {
				37	eigen_assert(m_ == k_ && k_ == n_);
				38	StartBenchmarkTiming();
				39	for (int iter = 0; iter < num_iters; ++iter) {
				40	device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
				41	}
				42	// Record the number of values copied per second
				43	finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
				44	}
				45
				46	void typeCasting(int num_iters) {
				47	eigen_assert(m_ == n_);
				48	Eigen::array<TensorIndex, 2> sizes;
				49	if (sizeof(T) >= sizeof(int)) {
				50	sizes[0] = m_;
				51	sizes[1] = k_;
				52	} else {
				53	sizes[0] = m_ * sizeof(T) / sizeof(int);
				54	sizes[1] = k_ * sizeof(T) / sizeof(int);
				55	}
				56	const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
				57	TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
				58
				59	StartBenchmarkTiming();
				60	for (int iter = 0; iter < num_iters; ++iter) {
				61	B.device(device_) = A.template cast<T>();
				62	}
				63	// Record the number of values copied per second
				64	finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
				65	}
				66
				67	void random(int num_iters) {
				68	eigen_assert(m_ == k_ && k_ == n_);
				69	Eigen::array<TensorIndex, 2> sizes;
				70	sizes[0] = m_;
				71	sizes[1] = m_;
				72	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
				73
				74	StartBenchmarkTiming();
				75	for (int iter = 0; iter < num_iters; ++iter) {
				76	C.device(device_) = C.random();
				77	}
				78	// Record the number of random numbers generated per second
				79	finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
				80	}
				81
				82	void slicing(int num_iters) {
				83	eigen_assert(m_ == k_ && k_ == n_);
				84	Eigen::array<TensorIndex, 2> sizes;
				85	sizes[0] = m_;
				86	sizes[1] = m_;
				87	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
				88	const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
				89	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
				90
				91	const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
				92	const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
				93	const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
				94	const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
				95	const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
				96
				97	StartBenchmarkTiming();
				98	for (int iter = 0; iter < num_iters; ++iter) {
				99	C.slice(first_quadrant, quarter_sizes).device(device_) =
				100	A.slice(first_quadrant, quarter_sizes);
				101	C.slice(second_quadrant, quarter_sizes).device(device_) =
				102	B.slice(second_quadrant, quarter_sizes);
				103	C.slice(third_quadrant, quarter_sizes).device(device_) =
				104	A.slice(third_quadrant, quarter_sizes);
				105	C.slice(fourth_quadrant, quarter_sizes).device(device_) =
				106	B.slice(fourth_quadrant, quarter_sizes);
				107	}
				108	// Record the number of values copied from the rhs slice to the lhs slice
				109	// each second
				110	finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
				111	}
				112
				113	void rowChip(int num_iters) {
				114	Eigen::array<TensorIndex, 2> input_size;
				115	input_size[0] = k_;
				116	input_size[1] = n_;
				117	const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
				118	Eigen::array<TensorIndex, 1> output_size;
				119	output_size[0] = n_;
				120	TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
				121
				122	StartBenchmarkTiming();
				123	for (int iter = 0; iter < num_iters; ++iter) {
				124	C.device(device_) = B.chip(iter % k_, 0);
				125	}
				126	// Record the number of values copied from the rhs chip to the lhs.
				127	finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
				128	}
				129
				130	void colChip(int num_iters) {
				131	Eigen::array<TensorIndex, 2> input_size;
				132	input_size[0] = k_;
				133	input_size[1] = n_;
				134	const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
				135	Eigen::array<TensorIndex, 1> output_size;
				136	output_size[0] = n_;
				137	TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
				138
				139	StartBenchmarkTiming();
				140	for (int iter = 0; iter < num_iters; ++iter) {
				141	C.device(device_) = B.chip(iter % n_, 1);
				142	}
				143	// Record the number of values copied from the rhs chip to the lhs.
				144	finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
				145	}
				146
				147	void shuffling(int num_iters) {
				148	eigen_assert(m_ == n_);
				149	Eigen::array<TensorIndex, 2> size_a;
				150	size_a[0] = m_;
				151	size_a[1] = k_;
				152	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
				153	Eigen::array<TensorIndex, 2> size_b;
				154	size_b[0] = k_;
				155	size_b[1] = m_;
				156	TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
				157
				158	Eigen::array<int, 2> shuffle;
				159	shuffle[0] = 1;
				160	shuffle[1] = 0;
				161
				162	StartBenchmarkTiming();
				163	for (int iter = 0; iter < num_iters; ++iter) {
				164	B.device(device_) = A.shuffle(shuffle);
				165	}
				166	// Record the number of values shuffled from A and copied to B each second
				167	finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
				168	}
				169
				170	void padding(int num_iters) {
				171	eigen_assert(m_ == k_);
				172	Eigen::array<TensorIndex, 2> size_a;
				173	size_a[0] = m_;
				174	size_a[1] = k_-3;
				175	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
				176	Eigen::array<TensorIndex, 2> size_b;
				177	size_b[0] = k_;
				178	size_b[1] = m_;
				179	TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
				180
				181	#if defined(EIGEN_HAS_INDEX_LIST)
				182	Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
				183	Eigen::type2indexpair<2, 1> > paddings;
				184	#else
				185	Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
				186	paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
				187	paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
				188	#endif
				189
				190	StartBenchmarkTiming();
				191	for (int iter = 0; iter < num_iters; ++iter) {
				192	B.device(device_) = A.pad(paddings);
				193	}
				194	// Record the number of values copied from the padded tensor A each second
				195	finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
				196	}
				197
				198	void striding(int num_iters) {
				199	eigen_assert(m_ == k_);
				200	Eigen::array<TensorIndex, 2> size_a;
				201	size_a[0] = m_;
				202	size_a[1] = k_;
				203	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
				204	Eigen::array<TensorIndex, 2> size_b;
				205	size_b[0] = m_;
				206	size_b[1] = k_/2;
				207	TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
				208
				209	#ifndef EIGEN_HAS_INDEX_LIST
				210	Eigen::array<TensorIndex, 2> strides;
				211	strides[0] = 1;
				212	strides[1] = 2;
				213	#else
				214	// Take advantage of cxx11 to give the compiler information it can use to
				215	// optimize the code.
				216	Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
				217	#endif
				218
				219	StartBenchmarkTiming();
				220	for (int iter = 0; iter < num_iters; ++iter) {
				221	B.device(device_) = A.stride(strides);
				222	}
				223	// Record the number of values copied from the padded tensor A each second
				224	finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
				225	}
				226
				227	void broadcasting(int num_iters) {
				228	Eigen::array<TensorIndex, 2> size_a;
				229	size_a[0] = m_;
				230	size_a[1] = 1;
				231	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
				232	Eigen::array<TensorIndex, 2> size_c;
				233	size_c[0] = m_;
				234	size_c[1] = n_;
				235	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
				236
				237	#ifndef EIGEN_HAS_INDEX_LIST
				238	Eigen::array<int, 2> broadcast;
				239	broadcast[0] = 1;
				240	broadcast[1] = n_;
				241	#else
				242	// Take advantage of cxx11 to give the compiler information it can use to
				243	// optimize the code.
				244	Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
				245	broadcast.set(1, n_);
				246	#endif
				247
				248	StartBenchmarkTiming();
				249	for (int iter = 0; iter < num_iters; ++iter) {
				250	C.device(device_) = A.broadcast(broadcast);
				251	}
				252	// Record the number of values broadcasted from A and copied to C each second
				253	finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
				254	}
				255
				256	void coeffWiseOp(int num_iters) {
				257	eigen_assert(m_ == k_ && k_ == n_);
				258	Eigen::array<TensorIndex, 2> sizes;
				259	sizes[0] = m_;
				260	sizes[1] = m_;
				261	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
				262	const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
				263	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
				264
				265	StartBenchmarkTiming();
				266	for (int iter = 0; iter < num_iters; ++iter) {
				267	C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
				268	}
				269	// Record the number of FLOP executed per second (2 multiplications and
				270	// 1 addition per value)
				271	finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
				272	}
				273
				274	void algebraicFunc(int num_iters) {
				275	eigen_assert(m_ == k_ && k_ == n_);
				276	Eigen::array<TensorIndex, 2> sizes;
				277	sizes[0] = m_;
				278	sizes[1] = m_;
				279	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
				280	const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
				281	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
				282
				283	StartBenchmarkTiming();
				284	for (int iter = 0; iter < num_iters; ++iter) {
				285	C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
				286	}
				287	// Record the number of FLOP executed per second (assuming one operation
				288	// per value)
				289	finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
				290	}
				291
				292	void transcendentalFunc(int num_iters) {
				293	eigen_assert(m_ == k_ && k_ == n_);
				294	Eigen::array<TensorIndex, 2> sizes;
				295	sizes[0] = m_;
				296	sizes[1] = m_;
				297	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
				298	const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
				299	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
				300
				301	StartBenchmarkTiming();
				302	for (int iter = 0; iter < num_iters; ++iter) {
				303	C.device(device_) = A.exp() + B.log();
				304	}
				305	// Record the number of FLOP executed per second (assuming one operation
				306	// per value)
				307	finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
				308	}
				309
				310	// Row reduction
				311	void rowReduction(int num_iters) {
				312	Eigen::array<TensorIndex, 2> input_size;
				313	input_size[0] = k_;
				314	input_size[1] = n_;
				315	const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
				316	Eigen::array<TensorIndex, 1> output_size;
				317	output_size[0] = n_;
				318	TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
				319
				320	#ifndef EIGEN_HAS_INDEX_LIST
				321	Eigen::array<TensorIndex, 1> sum_along_dim;
				322	sum_along_dim[0] = 0;
				323	#else
				324	// Take advantage of cxx11 to give the compiler information it can use to
				325	// optimize the code.
				326	Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
				327	#endif
				328
				329	StartBenchmarkTiming();
				330	for (int iter = 0; iter < num_iters; ++iter) {
				331	C.device(device_) = B.sum(sum_along_dim);
				332	}
				333	// Record the number of FLOP executed per second (assuming one operation
				334	// per value)
				335	finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
				336	}
				337
				338	// Column reduction
				339	void colReduction(int num_iters) {
				340	Eigen::array<TensorIndex, 2> input_size;
				341	input_size[0] = k_;
				342	input_size[1] = n_;
				343	const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
				344	b_, input_size);
				345	Eigen::array<TensorIndex, 1> output_size;
				346	output_size[0] = k_;
				347	TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
				348	c_, output_size);
				349
				350	#ifndef EIGEN_HAS_INDEX_LIST
				351	Eigen::array<TensorIndex, 1> sum_along_dim;
				352	sum_along_dim[0] = 1;
				353	#else
				354	// Take advantage of cxx11 to give the compiler information it can use to
				355	// optimize the code.
				356	Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
				357	#endif
				358
				359	StartBenchmarkTiming();
				360	for (int iter = 0; iter < num_iters; ++iter) {
				361	C.device(device_) = B.sum(sum_along_dim);
				362	}
				363	// Record the number of FLOP executed per second (assuming one operation
				364	// per value)
				365	finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
				366	}
				367
				368	// Full reduction
				369	void fullReduction(int num_iters) {
				370	Eigen::array<TensorIndex, 2> input_size;
				371	input_size[0] = k_;
				372	input_size[1] = n_;
				373	const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
				374	b_, input_size);
				375	Eigen::array<TensorIndex, 0> output_size;
				376	TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
				377	c_, output_size);
				378
				379	StartBenchmarkTiming();
				380	for (int iter = 0; iter < num_iters; ++iter) {
				381	C.device(device_) = B.sum();
				382	}
				383	// Record the number of FLOP executed per second (assuming one operation
				384	// per value)
				385	finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
				386	}
				387
				388	// do a contraction which is equivalent to a matrix multiplication
				389	void contraction(int num_iters) {
				390	Eigen::array<TensorIndex, 2> sizeA;
				391	sizeA[0] = m_;
				392	sizeA[1] = k_;
				393	Eigen::array<TensorIndex, 2> sizeB;
				394	sizeB[0] = k_;
				395	sizeB[1] = n_;
				396	Eigen::array<TensorIndex, 2> sizeC;
				397	sizeC[0] = m_;
				398	sizeC[1] = n_;
				399
				400	const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
				401	const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
				402	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
				403
				404	typedef typename Tensor<T, 2>::DimensionPair DimPair;
				405	Eigen::array<DimPair, 1> dims;
				406	dims[0] = DimPair(1, 0);
				407
				408	StartBenchmarkTiming();
				409	for (int iter = 0; iter < num_iters; ++iter) {
				410	C.device(device_) = A.contract(B, dims);
				411	}
				412	// Record the number of FLOP executed per second (size_ multiplications and
				413	// additions for each value in the resulting tensor)
				414	finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
				415	}
				416
				417	void convolution(int num_iters, int kernel_x, int kernel_y) {
				418	Eigen::array<TensorIndex, 2> input_sizes;
				419	input_sizes[0] = m_;
				420	input_sizes[1] = n_;
				421	TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
				422	Eigen::array<TensorIndex, 2> kernel_sizes;
				423	kernel_sizes[0] = kernel_x;
				424	kernel_sizes[1] = kernel_y;
				425	TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
				426	Eigen::array<TensorIndex, 2> result_sizes;
				427	result_sizes[0] = m_ - kernel_x + 1;
				428	result_sizes[1] = n_ - kernel_y + 1;
				429	TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
				430	Eigen::array<TensorIndex, 2> dims;
				431	dims[0] = 0;
				432	dims[1] = 1;
				433
				434	StartBenchmarkTiming();
				435	for (int iter = 0; iter < num_iters; ++iter) {
				436	C.device(device_) = A.convolve(B, dims);
				437	}
				438	// Record the number of FLOP executed per second (kernel_size
				439	// multiplications and additions for each value in the resulting tensor)
				440	finalizeBenchmark(static_cast<int64_t>(2) *
				441	(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
				442	}
				443
				444	private:
				445	void initialize() {
				446	a_ = (T ) device_.allocate(m_ k_ * sizeof(T));
				447	b_ = (T ) device_.allocate(k_ n_ * sizeof(T));
				448	c_ = (T ) device_.allocate(m_ n_ * sizeof(T));
				449
				450	// Initialize the content of the memory pools to prevent asan from
				451	// complaining.
				452	device_.memset(a_, 12, m_ * k_ * sizeof(T));
				453	device_.memset(b_, 23, k_ * n_ * sizeof(T));
				454	device_.memset(c_, 31, m_ * n_ * sizeof(T));
				455
				456	//BenchmarkUseRealTime();
				457	}
				458
				459	inline void finalizeBenchmark(int64_t num_items) {
				460	#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
				461	if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
				462	device_.synchronize();
				463	}
				464	#endif
				465	StopBenchmarkTiming();
				466	SetBenchmarkFlopsProcessed(num_items);
				467	}
				468
				469
				470	TensorIndex m_;
				471	TensorIndex k_;
				472	TensorIndex n_;
				473	T* a_;
				474	T* b_;
				475	T* c_;
				476	Device device_;
				477	};
				478	#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_