"""report.py - Utilities for reporting statistics about benchmark results
"""
import unittest
import os
import re
import copy

from scipy.stats import mannwhitneyu


class BenchmarkColor(object):
    def __init__(self, name, code):
        self.name = name
        self.code = code

    def __repr__(self):
        return '%s%r' % (self.__class__.__name__,
                         (self.name, self.code))

    def __format__(self, format):
        return self.code

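# Because __format__ returns the raw escape code, a BenchmarkColor can be used
# directly in str.format(); for example '{}error{}'.format(BC_FAIL, BC_ENDC)
# produces red text followed by a reset (assuming an ANSI-capable terminal).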

# Benchmark Colors Enumeration
BC_NONE = BenchmarkColor('NONE', '')
BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')

UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number; more is better.
UTEST_COL_NAME = "_pvalue"


def color_format(use_color, fmt_str, *args, **kwargs):
    """
    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
    is False then all color codes in 'args' and 'kwargs' are replaced with
    the empty string.
    """
    assert use_color is True or use_color is False
    if not use_color:
        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                for arg in args]
        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                  for key, arg in kwargs.items()}
    return fmt_str.format(*args, **kwargs)

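# Worked example of the color toggle: with use_color=False every BenchmarkColor
# argument collapses to BC_NONE (an empty code), so
#   color_format(False, '{}ok{endc}', BC_OKGREEN, endc=BC_ENDC) == 'ok'
# while use_color=True keeps the escape codes:
#   color_format(True, '{}ok{endc}', BC_OKGREEN, endc=BC_ENDC) == '\033[32mok\033[0m'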

def find_longest_name(benchmark_list):
    """
    Return the length of the longest benchmark name in a given list of
    benchmark JSON objects.
    """
    longest_name = 1
    for bc in benchmark_list:
        if len(bc['name']) > longest_name:
            longest_name = len(bc['name'])
    return longest_name


def calculate_change(old_val, new_val):
    """
    Return a float representing the decimal change between old_val and new_val.
    """
    if old_val == 0 and new_val == 0:
        return 0.0
    if old_val == 0:
        return float(new_val - old_val) / (float(old_val + new_val) / 2)
    return float(new_val - old_val) / abs(old_val)

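# Worked examples: calculate_change(100, 110) == +0.10 (10% slower) and
# calculate_change(50, 25) == -0.50 (2x faster); these are the ratios printed
# in the Time/CPU columns of the difference report below.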

def filter_benchmark(json_orig, family, replacement=""):
    """
    Keep only the benchmarks whose name matches the 'family' regex, rewriting
    the matched part of each name to 'replacement'.
    """
    regex = re.compile(family)
    filtered = {}
    filtered['benchmarks'] = []
    for be in json_orig['benchmarks']:
        if not regex.search(be['name']):
            continue
        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
        filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
        filtered['benchmarks'].append(filteredbench)
    return filtered

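# Example drawn from the tests below: filter_benchmark(json, "BM_Z.ro", ".")
# keeps only benchmarks matching the regex "BM_Z.ro", so a name such as
# 'BM_Zero/4' would become './4', which makes two families from a single run
# directly comparable.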

def get_unique_benchmark_names(json):
    """
    Return all unique benchmark names from the input, preserving the order in
    which they first appear.
    """
    seen = set()
    uniqued = [x['name'] for x in json['benchmarks']
               if x['name'] not in seen and
               (seen.add(x['name']) or True)]
    return uniqued

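# For instance, with the test3 inputs used below (repetitions of 'BM_One',
# 'BM_Two', 'short' and 'medium'), this returns
# ['BM_One', 'BM_Two', 'short', 'medium'] in first-appearance order.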

def intersect(list1, list2):
    """
    Given two lists, return a new list of the elements contained in *both*
    input lists, preserving the ordering of 'list1'.
    """
    return [x for x in list1 if x in list2]


def partition_benchmarks(json1, json2):
    """
    While preserving the ordering, find benchmarks with the same names in
    both of the inputs, and group them.
    (i.e. partition/filter into groups with common name)
    """
    json1_unique_names = get_unique_benchmark_names(json1)
    json2_unique_names = get_unique_benchmark_names(json2)
    names = intersect(json1_unique_names, json2_unique_names)
    partitions = []
    for name in names:
        # Pick the time unit from the first entry of the lhs benchmark.
        time_unit = next(x['time_unit']
                         for x in json1['benchmarks'] if x['name'] == name)
        # Filter by name and time unit.
        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        partitions.append([lhs, rhs])
    return partitions

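# Shape of the result: a list of [lhs, rhs] pairs, one per common name, where
# lhs holds every repetition of that benchmark from json1 and rhs every
# repetition from json2, e.g.
#   [[<'BM_One' entries from json1>, <'BM_One' entries from json2>], ...]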

def extract_field(partition, field_name):
    # The count of elements may be different. We want *all* of them.
    lhs = [x[field_name] for x in partition[0]]
    rhs = [x[field_name] for x in partition[1]]
    return [lhs, rhs]


def print_utest(partition, utest_alpha, first_col_width, use_color=True):
    timings_time = extract_field(partition, 'real_time')
    timings_cpu = extract_field(partition, 'cpu_time')

    min_rep_cnt = min(len(timings_time[0]),
                      len(timings_time[1]),
                      len(timings_cpu[0]),
                      len(timings_cpu[1]))

    # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
    if min_rep_cnt < UTEST_MIN_REPETITIONS:
        return []

    def get_utest_color(pval):
        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN

    time_pvalue = mannwhitneyu(
        timings_time[0], timings_time[1], alternative='two-sided').pvalue
    cpu_pvalue = mannwhitneyu(
        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue

    dsc = "U Test, Repetitions: {} vs {}".format(
        len(timings_cpu[0]), len(timings_cpu[1]))
    dsc_color = BC_OKGREEN

    if min_rep_cnt < UTEST_OPTIMAL_REPETITIONS:
        dsc_color = BC_WARNING
        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
            UTEST_OPTIMAL_REPETITIONS)

    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}"

    last_name = partition[0][0]['name']
    return [color_format(use_color,
                         special_str,
                         BC_HEADER,
                         "{}{}".format(last_name, UTEST_COL_NAME),
                         first_col_width,
                         get_utest_color(time_pvalue), time_pvalue,
                         get_utest_color(cpu_pvalue), cpu_pvalue,
                         dsc_color, dsc,
                         endc=BC_ENDC)]

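# When both sides have at least UTEST_MIN_REPETITIONS measurements this yields a
# single extra report row such as (spacing condensed):
#   BM_Two_pvalue    0.6985    0.6985    U Test, Repetitions: 2 vs 2. WARNING: ...
# as exercised by TestReportDifferenceWithUTest below; otherwise it returns [].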

def generate_difference_report(
        json1,
        json2,
        display_aggregates_only=False,
        utest=False,
        utest_alpha=0.05,
        use_color=True):
    """
    Calculate and report the difference between each test of two benchmark
    runs specified as 'json1' and 'json2'.
    """
    assert utest is True or utest is False
    first_col_width = find_longest_name(json1['benchmarks'])

    def find_test(name):
        for b in json2['benchmarks']:
            if b['name'] == name:
                return b
        return None

    first_col_width = max(
        first_col_width,
        len('Benchmark'))
    first_col_width += len(UTEST_COL_NAME)
    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
        'Benchmark', 12 + first_col_width)
    output_strs = [first_line, '-' * len(first_line)]

    partitions = partition_benchmarks(json1, json2)
    for partition in partitions:
        # Careful, the two sides may have different repetition counts.
        for i in range(min(len(partition[0]), len(partition[1]))):
            bn = partition[0][i]
            other_bench = partition[1][i]

            # *If* we were asked to only display aggregates,
            # and if it is non-aggregate, then skip it.
            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
                assert bn['run_type'] == other_bench['run_type']
                if bn['run_type'] != 'aggregate':
                    continue

            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"

            def get_color(res):
                if res > 0.05:
                    return BC_FAIL
                elif res > -0.07:
                    return BC_WHITE
                else:
                    return BC_CYAN

            tres = calculate_change(bn['real_time'], other_bench['real_time'])
            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
            output_strs += [color_format(use_color,
                                         fmt_str,
                                         BC_HEADER,
                                         bn['name'],
                                         first_col_width,
                                         get_color(tres),
                                         tres,
                                         get_color(cpures),
                                         cpures,
                                         bn['real_time'],
                                         other_bench['real_time'],
                                         bn['cpu_time'],
                                         other_bench['cpu_time'],
                                         endc=BC_ENDC)]

        # After processing the whole partition, if requested, do the U test.
        if utest:
            output_strs += print_utest(partition,
                                       utest_alpha=utest_alpha,
                                       first_col_width=first_col_width,
                                       use_color=use_color)

    return output_strs

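# Minimal usage sketch (the file names are hypothetical; any two Google
# Benchmark JSON result files will do):
#
#   import json
#   with open('baseline.json') as f:
#       baseline = json.load(f)
#   with open('contender.json') as f:
#       contender = json.load(f)
#   for line in generate_difference_report(baseline, contender, utest=True):
#       print(line)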

###############################################################################
# Unit tests


class TestGetUniqueBenchmarkNames(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test3_run0.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            'BM_One',
            'BM_Two',
            'short',  # These two are not sorted
            'medium',  # These two are not sorted
        ]
        json = self.load_results()
        output_lines = get_unique_benchmark_names(json)
        print("\n")
        print("\n".join(output_lines))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            self.assertEqual(expect_lines[i], output_lines[i])


class TestReportDifference(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_basic(self):
        expect_lines = [
            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
            ['BM_100xSlower', '+99.0000', '+99.0000',
             '100', '10000', '100', '10000'],
            ['BM_100xFaster', '-0.9900', '-0.9900',
             '10000', '100', '10000', '100'],
            ['BM_10PercentCPUToTime', '+0.1000',
             '-0.1000', '100', '110', '100', '90'],
            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
            ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceBetweenFamilies(unittest.TestCase):
    def load_result(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test2_run.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
        ]
        json = self.load_result()
        json1 = filter_benchmark(json, "BM_Z.ro", ".")
        json2 = filter_benchmark(json, "BM_O.e", ".")
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTest(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
        unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, display_aggregates_only=True,
            utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


if __name__ == '__main__':
    unittest.main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;