Blame - bench/benchmark-blocking-sizes.cpp - RealtimeRoboticsGroup/test

blob: 827be2880294bfa395a20b51f151d0d8387655eb [file] [log] [blame]

Austin Schuh	189376f	2018-12-20 22:11:15 +1100	[diff] [blame^]	1	// This file is part of Eigen, a lightweight C++ template library
				2	// for linear algebra.
				3	//
				4	// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
				5	//
				6	// This Source Code Form is subject to the terms of the Mozilla
				7	// Public License v. 2.0. If a copy of the MPL was not distributed
				8	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
				9
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
				17
				18	bool eigen_use_specific_block_size;
				19	int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
				20	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
				21	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
				22	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
				23	#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
				24	#include <Eigen/Core>
				25
				26	#include <bench/BenchTimer.h>
				27
				28	using namespace Eigen;
				29	using namespace std;
				30
				31	static BenchTimer timer;
				32
				33	// how many times we repeat each measurement.
				34	// measurements are randomly shuffled - we're not doing
				35	// all N identical measurements in a row.
				36	const int measurement_repetitions = 3;
				37
				38	// Timings below this value are too short to be accurate,
				39	// we'll repeat measurements with more iterations until
				40	// we get a timing above that threshold.
				41	const float min_accurate_time = 1e-2f;
				42
				43	// See --min-working-set-size command line parameter.
				44	size_t min_working_set_size = 0;
				45
				46	float max_clock_speed = 0.0f;
				47
				48	// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
				49	const size_t maxsize = 2048;
				50	const size_t minsize = 16;
				51
				52	typedef MatrixXf MatrixType;
				53	typedef MatrixType::Scalar Scalar;
				54	typedef internal::packet_traits<Scalar>::type Packet;
				55
				56	static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
				57	static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
				58	static_assert(maxsize > minsize, "maxsize must be larger than minsize");
				59	static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
				60
				61	// just a helper to store a triple of K,M,N sizes for matrix product
				62	struct size_triple_t
				63	{
				64	size_t k, m, n;
				65	size_triple_t() : k(0), m(0), n(0) {}
				66	size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
				67	size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
				68	size_triple_t(uint16_t compact)
				69	{
				70	k = 1 << ((compact & 0xf00) >> 8);
				71	m = 1 << ((compact & 0x0f0) >> 4);
				72	n = 1 << ((compact & 0x00f) >> 0);
				73	}
				74	};
				75
				76	uint8_t log2_pot(size_t x) {
				77	size_t l = 0;
				78	while (x >>= 1) l++;
				79	return l;
				80	}
				81
				82	// Convert between size tripes and a compact form fitting in 12 bits
				83	// where each size, which must be a POT, is encoded as its log2, on 4 bits
				84	// so the largest representable size is 2^15 == 32k ... big enough.
				85	uint16_t compact_size_triple(size_t k, size_t m, size_t n)
				86	{
				87	return (log2_pot(k) << 8) \| (log2_pot(m) << 4) \| log2_pot(n);
				88	}
				89
				90	uint16_t compact_size_triple(const size_triple_t& t)
				91	{
				92	return compact_size_triple(t.k, t.m, t.n);
				93	}
				94
				95	// A single benchmark. Initially only contains benchmark params.
				96	// Then call run(), which stores the result in the gflops field.
				97	struct benchmark_t
				98	{
				99	uint16_t compact_product_size;
				100	uint16_t compact_block_size;
				101	bool use_default_block_size;
				102	float gflops;
				103	benchmark_t()
				104	: compact_product_size(0)
				105	, compact_block_size(0)
				106	, use_default_block_size(false)
				107	, gflops(0)
				108	{
				109	}
				110	benchmark_t(size_t pk, size_t pm, size_t pn,
				111	size_t bk, size_t bm, size_t bn)
				112	: compact_product_size(compact_size_triple(pk, pm, pn))
				113	, compact_block_size(compact_size_triple(bk, bm, bn))
				114	, use_default_block_size(false)
				115	, gflops(0)
				116	{}
				117	benchmark_t(size_t pk, size_t pm, size_t pn)
				118	: compact_product_size(compact_size_triple(pk, pm, pn))
				119	, compact_block_size(0)
				120	, use_default_block_size(true)
				121	, gflops(0)
				122	{}
				123
				124	void run();
				125	};
				126
				127	ostream& operator<<(ostream& s, const benchmark_t& b)
				128	{
				129	s << hex << b.compact_product_size << dec;
				130	if (b.use_default_block_size) {
				131	size_triple_t t(b.compact_product_size);
				132	Index k = t.k, m = t.m, n = t.n;
				133	internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
				134	s << " default(" << k << ", " << m << ", " << n << ")";
				135	} else {
				136	s << " " << hex << b.compact_block_size << dec;
				137	}
				138	s << " " << b.gflops;
				139	return s;
				140	}
				141
				142	// We sort first by increasing benchmark parameters,
				143	// then by decreasing performance.
				144	bool operator<(const benchmark_t& b1, const benchmark_t& b2)
				145	{
				146	return b1.compact_product_size < b2.compact_product_size \|\|
				147	(b1.compact_product_size == b2.compact_product_size && (
				148	(b1.compact_block_size < b2.compact_block_size \|\| (
				149	b1.compact_block_size == b2.compact_block_size &&
				150	b1.gflops > b2.gflops))));
				151	}
				152
				153	void benchmark_t::run()
				154	{
				155	size_triple_t productsizes(compact_product_size);
				156
				157	if (use_default_block_size) {
				158	eigen_use_specific_block_size = false;
				159	} else {
				160	// feed eigen with our custom blocking params
				161	eigen_use_specific_block_size = true;
				162	size_triple_t blocksizes(compact_block_size);
				163	eigen_block_size_k = blocksizes.k;
				164	eigen_block_size_m = blocksizes.m;
				165	eigen_block_size_n = blocksizes.n;
				166	}
				167
				168	// set up the matrix pool
				169
				170	const size_t combined_three_matrices_sizes =
				171	sizeof(Scalar) *
				172	(productsizes.k * productsizes.m +
				173	productsizes.k * productsizes.n +
				174	productsizes.m * productsizes.n);
				175
				176	// 64 M is large enough that nobody has a cache bigger than that,
				177	// while still being small enough that everybody has this much RAM,
				178	// so conveniently we don't need to special-case platforms here.
				179	const size_t unlikely_large_cache_size = 64 << 20;
				180
				181	const size_t working_set_size =
				182	min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
				183
				184	const size_t matrix_pool_size =
				185	1 + working_set_size / combined_three_matrices_sizes;
				186
				187	MatrixType *lhs = new MatrixType[matrix_pool_size];
				188	MatrixType *rhs = new MatrixType[matrix_pool_size];
				189	MatrixType *dst = new MatrixType[matrix_pool_size];
				190
				191	for (size_t i = 0; i < matrix_pool_size; i++) {
				192	lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
				193	rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
				194	dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
				195	}
				196
				197	// main benchmark loop
				198
				199	int iters_at_a_time = 1;
				200	float time_per_iter = 0.0f;
				201	size_t matrix_index = 0;
				202	while (true) {
				203
				204	double starttime = timer.getCpuTime();
				205	for (int i = 0; i < iters_at_a_time; i++) {
				206	dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
				207	matrix_index++;
				208	if (matrix_index == matrix_pool_size) {
				209	matrix_index = 0;
				210	}
				211	}
				212	double endtime = timer.getCpuTime();
				213
				214	const float timing = float(endtime - starttime);
				215
				216	if (timing >= min_accurate_time) {
				217	time_per_iter = timing / iters_at_a_time;
				218	break;
				219	}
				220
				221	iters_at_a_time *= 2;
				222	}
				223
				224	delete[] lhs;
				225	delete[] rhs;
				226	delete[] dst;
				227
				228	gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
				229	}
				230
				231	void print_cpuinfo()
				232	{
				233	#ifdef __linux__
				234	cout << "contents of /proc/cpuinfo:" << endl;
				235	string line;
				236	ifstream cpuinfo("/proc/cpuinfo");
				237	if (cpuinfo.is_open()) {
				238	while (getline(cpuinfo, line)) {
				239	cout << line << endl;
				240	}
				241	cpuinfo.close();
				242	}
				243	cout << endl;
				244	#elif defined __APPLE__
				245	cout << "output of sysctl hw:" << endl;
				246	system("sysctl hw");
				247	cout << endl;
				248	#endif
				249	}
				250
				251	template <typename T>
				252	string type_name()
				253	{
				254	return "unknown";
				255	}
				256
				257	template<>
				258	string type_name<float>()
				259	{
				260	return "float";
				261	}
				262
				263	template<>
				264	string type_name<double>()
				265	{
				266	return "double";
				267	}
				268
				269	struct action_t
				270	{
				271	virtual const char* invokation_name() const { abort(); return nullptr; }
				272	virtual void run() const { abort(); }
				273	virtual ~action_t() {}
				274	};
				275
				276	void show_usage_and_exit(int /argc/, char* argv[],
				277	const vector<unique_ptr<action_t>>& available_actions)
				278	{
				279	cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
				280	cerr << "available actions:" << endl << endl;
				281	for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
				282	cerr << " " << (*it)->invokation_name() << endl;
				283	}
				284	cerr << endl;
				285	cerr << "options:" << endl << endl;
				286	cerr << " --min-working-set-size=N:" << endl;
				287	cerr << " Set the minimum working set size to N bytes." << endl;
				288	cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
				289	cerr << " A larger working set lowers the chance of a warm cache." << endl;
				290	cerr << " The default value 0 means use a large enough working" << endl;
				291	cerr << " set to likely outsize caches." << endl;
				292	cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
				293	cerr << " avoid warm caches." << endl;
				294	exit(1);
				295	}
				296
				297	float measure_clock_speed()
				298	{
				299	cerr << "Measuring clock speed... \r" << flush;
				300
				301	vector<float> all_gflops;
				302	for (int i = 0; i < 8; i++) {
				303	benchmark_t b(1024, 1024, 1024);
				304	b.run();
				305	all_gflops.push_back(b.gflops);
				306	}
				307
				308	sort(all_gflops.begin(), all_gflops.end());
				309	float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
				310
				311	// multiply by an arbitrary constant to discourage trying doing anything with the
				312	// returned values besides just comparing them with each other.
				313	float result = stable_estimate * 123.456f;
				314
				315	return result;
				316	}
				317
				318	struct human_duration_t
				319	{
				320	int seconds;
				321	human_duration_t(int s) : seconds(s) {}
				322	};
				323
				324	ostream& operator<<(ostream& s, const human_duration_t& d)
				325	{
				326	int remainder = d.seconds;
				327	if (remainder > 3600) {
				328	int hours = remainder / 3600;
				329	s << hours << " h ";
				330	remainder -= hours * 3600;
				331	}
				332	if (remainder > 60) {
				333	int minutes = remainder / 60;
				334	s << minutes << " min ";
				335	remainder -= minutes * 60;
				336	}
				337	if (d.seconds < 600) {
				338	s << remainder << " s";
				339	}
				340	return s;
				341	}
				342
				343	const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
				344
				345	void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
				346	{
				347	FILE* file = fopen(filename, "w");
				348	if (!file) {
				349	cerr << "Could not open file " << filename << " for writing." << endl;
				350	cerr << "Do you have write permissions on the current working directory?" << endl;
				351	exit(1);
				352	}
				353	size_t benchmarks_vector_size = benchmarks.size();
				354	fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
				355	fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
				356	fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
				357	fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
				358	fclose(file);
				359	}
				360
				361	bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
				362	{
				363	FILE* file = fopen(filename, "r");
				364	if (!file) {
				365	return false;
				366	}
				367	if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
				368	return false;
				369	}
				370	size_t benchmarks_vector_size = 0;
				371	if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
				372	return false;
				373	}
				374	if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
				375	return false;
				376	}
				377	benchmarks.resize(benchmarks_vector_size);
				378	if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
				379	return false;
				380	}
				381	unlink(filename);
				382	return true;
				383	}
				384
				385	void try_run_some_benchmarks(
				386	vector<benchmark_t>& benchmarks,
				387	double time_start,
				388	size_t& first_benchmark_to_run)
				389	{
				390	if (first_benchmark_to_run == benchmarks.size()) {
				391	return;
				392	}
				393
				394	double time_last_progress_update = 0;
				395	double time_last_clock_speed_measurement = 0;
				396	double time_now = 0;
				397
				398	size_t benchmark_index = first_benchmark_to_run;
				399
				400	while (true) {
				401	float ratio_done = float(benchmark_index) / benchmarks.size();
				402	time_now = timer.getRealTime();
				403
				404	// We check clock speed every minute and at the end.
				405	if (benchmark_index == benchmarks.size() \|\|
				406	time_now > time_last_clock_speed_measurement + 60.0f)
				407	{
				408	time_last_clock_speed_measurement = time_now;
				409
				410	// Ensure that clock speed is as expected
				411	float current_clock_speed = measure_clock_speed();
				412
				413	// The tolerance needs to be smaller than the relative difference between
				414	// clock speeds that a device could operate under.
				415	// It seems unlikely that a device would be throttling clock speeds by
				416	// amounts smaller than 2%.
				417	// With a value of 1%, I was getting within noise on a Sandy Bridge.
				418	const float clock_speed_tolerance = 0.02f;
				419
				420	if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
				421	// Clock speed is now higher than we previously measured.
				422	// Either our initial measurement was inaccurate, which won't happen
				423	// too many times as we are keeping the best clock speed value and
				424	// and allowing some tolerance; or something really weird happened,
				425	// which invalidates all benchmark results collected so far.
				426	// Either way, we better restart all over again now.
				427	if (benchmark_index) {
				428	cerr << "Restarting at " << 100.0f * ratio_done
				429	<< " % because clock speed increased. " << endl;
				430	}
				431	max_clock_speed = current_clock_speed;
				432	first_benchmark_to_run = 0;
				433	return;
				434	}
				435
				436	bool rerun_last_tests = false;
				437
				438	if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
				439	cerr << "Measurements completed so far: "
				440	<< 100.0f * ratio_done
				441	<< " % " << endl;
				442	cerr << "Clock speed seems to be only "
				443	<< current_clock_speed/max_clock_speed
				444	<< " times what it used to be." << endl;
				445
				446	unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
				447
				448	while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
				449	if (seconds_to_sleep_if_lower_clock_speed > 32) {
				450	cerr << "Sleeping longer probably won't make a difference." << endl;
				451	cerr << "Serializing benchmarks to " << session_filename << endl;
				452	serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
				453	cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
				454	exit(2);
				455	}
				456	rerun_last_tests = true;
				457	cerr << "Sleeping "
				458	<< seconds_to_sleep_if_lower_clock_speed
				459	<< " s... \r" << endl;
				460	sleep(seconds_to_sleep_if_lower_clock_speed);
				461	current_clock_speed = measure_clock_speed();
				462	seconds_to_sleep_if_lower_clock_speed *= 2;
				463	}
				464	}
				465
				466	if (rerun_last_tests) {
				467	cerr << "Redoing the last "
				468	<< 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
				469	<< " % because clock speed had been low. " << endl;
				470	return;
				471	}
				472
				473	// nothing wrong with the clock speed so far, so there won't be a need to rerun
				474	// benchmarks run so far in case we later encounter a lower clock speed.
				475	first_benchmark_to_run = benchmark_index;
				476	}
				477
				478	if (benchmark_index == benchmarks.size()) {
				479	// We're done!
				480	first_benchmark_to_run = benchmarks.size();
				481	// Erase progress info
				482	cerr << " " << endl;
				483	return;
				484	}
				485
				486	// Display progress info on stderr
				487	if (time_now > time_last_progress_update + 1.0f) {
				488	time_last_progress_update = time_now;
				489	cerr << "Measurements... " << 100.0f * ratio_done
				490	<< " %, ETA "
				491	<< human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
				492	<< " \r" << flush;
				493	}
				494
				495	// This is where we actually run a benchmark!
				496	benchmarks[benchmark_index].run();
				497	benchmark_index++;
				498	}
				499	}
				500
				501	void run_benchmarks(vector<benchmark_t>& benchmarks)
				502	{
				503	size_t first_benchmark_to_run;
				504	vector<benchmark_t> deserialized_benchmarks;
				505	bool use_deserialized_benchmarks = false;
				506	if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
				507	cerr << "Found serialized session with "
				508	<< 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
				509	<< " % already done" << endl;
				510	if (deserialized_benchmarks.size() == benchmarks.size() &&
				511	first_benchmark_to_run > 0 &&
				512	first_benchmark_to_run < benchmarks.size())
				513	{
				514	use_deserialized_benchmarks = true;
				515	}
				516	}
				517
				518	if (use_deserialized_benchmarks) {
				519	benchmarks = deserialized_benchmarks;
				520	} else {
				521	// not using deserialized benchmarks, starting from scratch
				522	first_benchmark_to_run = 0;
				523
				524	// Randomly shuffling benchmarks allows us to get accurate enough progress info,
				525	// as now the cheap/expensive benchmarks are randomly mixed so they average out.
				526	// It also means that if data is corrupted for some time span, the odds are that
				527	// not all repetitions of a given benchmark will be corrupted.
				528	random_shuffle(benchmarks.begin(), benchmarks.end());
				529	}
				530
				531	for (int i = 0; i < 4; i++) {
				532	max_clock_speed = max(max_clock_speed, measure_clock_speed());
				533	}
				534
				535	double time_start = 0.0;
				536	while (first_benchmark_to_run < benchmarks.size()) {
				537	if (first_benchmark_to_run == 0) {
				538	time_start = timer.getRealTime();
				539	}
				540	try_run_some_benchmarks(benchmarks,
				541	time_start,
				542	first_benchmark_to_run);
				543	}
				544
				545	// Sort timings by increasing benchmark parameters, and decreasing gflops.
				546	// The latter is very important. It means that we can ignore all but the first
				547	// benchmark with given parameters.
				548	sort(benchmarks.begin(), benchmarks.end());
				549
				550	// Collect best (i.e. now first) results for each parameter values.
				551	vector<benchmark_t> best_benchmarks;
				552	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
				553	if (best_benchmarks.empty() \|\|
				554	best_benchmarks.back().compact_product_size != it->compact_product_size \|\|
				555	best_benchmarks.back().compact_block_size != it->compact_block_size)
				556	{
				557	best_benchmarks.push_back(*it);
				558	}
				559	}
				560
				561	// keep and return only the best benchmarks
				562	benchmarks = best_benchmarks;
				563	}
				564
				565	struct measure_all_pot_sizes_action_t : action_t
				566	{
				567	virtual const char* invokation_name() const { return "all-pot-sizes"; }
				568	virtual void run() const
				569	{
				570	vector<benchmark_t> benchmarks;
				571	for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
				572	for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
				573	for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
				574	for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
				575	for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
				576	for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
				577	for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
				578	benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
				579	}
				580	}
				581	}
				582	}
				583	}
				584	}
				585	}
				586
				587	run_benchmarks(benchmarks);
				588
				589	cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
				590	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
				591	cout << *it << endl;
				592	}
				593	}
				594	};
				595
				596	struct measure_default_sizes_action_t : action_t
				597	{
				598	virtual const char* invokation_name() const { return "default-sizes"; }
				599	virtual void run() const
				600	{
				601	vector<benchmark_t> benchmarks;
				602	for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
				603	for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
				604	for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
				605	for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
				606	benchmarks.emplace_back(ksize, msize, nsize);
				607	}
				608	}
				609	}
				610	}
				611
				612	run_benchmarks(benchmarks);
				613
				614	cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
				615	for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
				616	cout << *it << endl;
				617	}
				618	}
				619	};
				620
				621	int main(int argc, char* argv[])
				622	{
				623	double time_start = timer.getRealTime();
				624	cout.precision(4);
				625	cerr.precision(4);
				626
				627	vector<unique_ptr<action_t>> available_actions;
				628	available_actions.emplace_back(new measure_all_pot_sizes_action_t);
				629	available_actions.emplace_back(new measure_default_sizes_action_t);
				630
				631	auto action = available_actions.end();
				632
				633	if (argc <= 1) {
				634	show_usage_and_exit(argc, argv, available_actions);
				635	}
				636	for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
				637	if (!strcmp(argv[1], (*it)->invokation_name())) {
				638	action = it;
				639	break;
				640	}
				641	}
				642
				643	if (action == available_actions.end()) {
				644	show_usage_and_exit(argc, argv, available_actions);
				645	}
				646
				647	for (int i = 2; i < argc; i++) {
				648	if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
				649	const char* equals_sign = strchr(argv[i], '=');
				650	min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
				651	} else {
				652	cerr << "unrecognized option: " << argv[i] << endl << endl;
				653	show_usage_and_exit(argc, argv, available_actions);
				654	}
				655	}
				656
				657	print_cpuinfo();
				658
				659	cout << "benchmark parameters:" << endl;
				660	cout << "pointer size: " << 8sizeof(void) << " bits" << endl;
				661	cout << "scalar type: " << type_name<Scalar>() << endl;
				662	cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
				663	cout << "minsize = " << minsize << endl;
				664	cout << "maxsize = " << maxsize << endl;
				665	cout << "measurement_repetitions = " << measurement_repetitions << endl;
				666	cout << "min_accurate_time = " << min_accurate_time << endl;
				667	cout << "min_working_set_size = " << min_working_set_size;
				668	if (min_working_set_size == 0) {
				669	cout << " (try to outsize caches)";
				670	}
				671	cout << endl << endl;
				672
				673	(*action)->run();
				674
				675	double time_end = timer.getRealTime();
				676	cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
				677	}