"""report.py - Utilities for reporting statistics about benchmark results
"""
import unittest
import os
import re
import copy

from scipy.stats import mannwhitneyu


class BenchmarkColor(object):
    def __init__(self, name, code):
        self.name = name
        self.code = code

    def __repr__(self):
        return '%s%r' % (self.__class__.__name__,
                         (self.name, self.code))

    def __format__(self, format):
        return self.code

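# Because __format__ returns the raw escape code, a BenchmarkColor can be used
# directly in str.format(); for example '{}error{}'.format(BC_FAIL, BC_ENDC)
# produces red text followed by a reset (assuming an ANSI-capable terminal).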

# Benchmark Colors Enumeration
BC_NONE = BenchmarkColor('NONE', '')
BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')

UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number; more is better.
UTEST_COL_NAME = "_pvalue"


def color_format(use_color, fmt_str, *args, **kwargs):
    """
    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
    is False then all color codes in 'args' and 'kwargs' are replaced with
    the empty string.
    """
    assert use_color is True or use_color is False
    if not use_color:
        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                for arg in args]
        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
                  for key, arg in kwargs.items()}
    return fmt_str.format(*args, **kwargs)

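# Worked example of the color toggle: with use_color=False every BenchmarkColor
# argument collapses to BC_NONE (an empty code), so
#   color_format(False, '{}ok{endc}', BC_OKGREEN, endc=BC_ENDC) == 'ok'
# while use_color=True keeps the escape codes:
#   color_format(True, '{}ok{endc}', BC_OKGREEN, endc=BC_ENDC) == '\033[32mok\033[0m'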

def find_longest_name(benchmark_list):
    """
    Return the length of the longest benchmark name in a given list of
    benchmark JSON objects.
    """
    longest_name = 1
    for bc in benchmark_list:
        if len(bc['name']) > longest_name:
            longest_name = len(bc['name'])
    return longest_name


def calculate_change(old_val, new_val):
    """
    Return a float representing the decimal change between old_val and new_val.
    """
    if old_val == 0 and new_val == 0:
        return 0.0
    if old_val == 0:
        return float(new_val - old_val) / (float(old_val + new_val) / 2)
    return float(new_val - old_val) / abs(old_val)

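# Worked examples: calculate_change(100, 110) == +0.10 (10% slower) and
# calculate_change(50, 25) == -0.50 (2x faster); these are the ratios printed
# in the Time/CPU columns of the difference report below.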

def filter_benchmark(json_orig, family, replacement=""):
    """
    Keep only the benchmarks whose name matches the 'family' regex, rewriting
    the matched part of each name to 'replacement'.
    """
    regex = re.compile(family)
    filtered = {}
    filtered['benchmarks'] = []
    for be in json_orig['benchmarks']:
        if not regex.search(be['name']):
            continue
        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
        filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
        filtered['benchmarks'].append(filteredbench)
    return filtered

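# Example drawn from the tests below: filter_benchmark(json, "BM_Z.ro", ".")
# keeps only benchmarks matching the regex "BM_Z.ro", so a name such as
# 'BM_Zero/4' would become './4', which makes two families from a single run
# directly comparable.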

def get_unique_benchmark_names(json):
    """
    Return all unique benchmark names from the input, preserving the order in
    which they first appear.
    """
    seen = set()
    uniqued = [x['name'] for x in json['benchmarks']
               if x['name'] not in seen and
               (seen.add(x['name']) or True)]
    return uniqued

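# For instance, with the test3 inputs used below (repetitions of 'BM_One',
# 'BM_Two', 'short' and 'medium'), this returns
# ['BM_One', 'BM_Two', 'short', 'medium'] in first-appearance order.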

def intersect(list1, list2):
    """
    Given two lists, return a new list of the elements contained in *both*
    input lists, preserving the ordering of 'list1'.
    """
    return [x for x in list1 if x in list2]


def partition_benchmarks(json1, json2):
    """
    While preserving the ordering, find benchmarks with the same names in
    both of the inputs, and group them.
    (i.e. partition/filter into groups with common name)
    """
    json1_unique_names = get_unique_benchmark_names(json1)
    json2_unique_names = get_unique_benchmark_names(json2)
    names = intersect(json1_unique_names, json2_unique_names)
    partitions = []
    for name in names:
        # Pick the time unit from the first entry of the lhs benchmark.
        time_unit = next(x['time_unit']
                         for x in json1['benchmarks'] if x['name'] == name)
        # Filter by name and time unit.
        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
               x['time_unit'] == time_unit]
        partitions.append([lhs, rhs])
    return partitions

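# Shape of the result: a list of [lhs, rhs] pairs, one per common name, where
# lhs holds every repetition of that benchmark from json1 and rhs every
# repetition from json2, e.g.
#   [[<'BM_One' entries from json1>, <'BM_One' entries from json2>], ...]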

def extract_field(partition, field_name):
    # The count of elements may be different. We want *all* of them.
    lhs = [x[field_name] for x in partition[0]]
    rhs = [x[field_name] for x in partition[1]]
    return [lhs, rhs]


def print_utest(partition, utest_alpha, first_col_width, use_color=True):
    timings_time = extract_field(partition, 'real_time')
    timings_cpu = extract_field(partition, 'cpu_time')

    min_rep_cnt = min(len(timings_time[0]),
                      len(timings_time[1]),
                      len(timings_cpu[0]),
                      len(timings_cpu[1]))

    # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
    if min_rep_cnt < UTEST_MIN_REPETITIONS:
        return []

    def get_utest_color(pval):
        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN

    time_pvalue = mannwhitneyu(
        timings_time[0], timings_time[1], alternative='two-sided').pvalue
    cpu_pvalue = mannwhitneyu(
        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue

    dsc = "U Test, Repetitions: {} vs {}".format(
        len(timings_cpu[0]), len(timings_cpu[1]))
    dsc_color = BC_OKGREEN

    if min_rep_cnt < UTEST_OPTIMAL_REPETITIONS:
        dsc_color = BC_WARNING
        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
            UTEST_OPTIMAL_REPETITIONS)

    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}"

    last_name = partition[0][0]['name']
    return [color_format(use_color,
                         special_str,
                         BC_HEADER,
                         "{}{}".format(last_name, UTEST_COL_NAME),
                         first_col_width,
                         get_utest_color(time_pvalue), time_pvalue,
                         get_utest_color(cpu_pvalue), cpu_pvalue,
                         dsc_color, dsc,
                         endc=BC_ENDC)]

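# When both sides have at least UTEST_MIN_REPETITIONS measurements this yields a
# single extra report row such as (spacing condensed):
#   BM_Two_pvalue    0.6985    0.6985    U Test, Repetitions: 2 vs 2. WARNING: ...
# as exercised by TestReportDifferenceWithUTest below; otherwise it returns [].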

def generate_difference_report(
        json1,
        json2,
        display_aggregates_only=False,
        utest=False,
        utest_alpha=0.05,
        use_color=True):
    """
    Calculate and report the difference between each test of two benchmark
    runs specified as 'json1' and 'json2'.
    """
    assert utest is True or utest is False
    first_col_width = find_longest_name(json1['benchmarks'])

    def find_test(name):
        for b in json2['benchmarks']:
            if b['name'] == name:
                return b
        return None

    first_col_width = max(
        first_col_width,
        len('Benchmark'))
    first_col_width += len(UTEST_COL_NAME)
    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
        'Benchmark', 12 + first_col_width)
    output_strs = [first_line, '-' * len(first_line)]

    partitions = partition_benchmarks(json1, json2)
    for partition in partitions:
        # Careful, the two sides may have different repetition counts.
        for i in range(min(len(partition[0]), len(partition[1]))):
            bn = partition[0][i]
            other_bench = partition[1][i]

            # *If* we were asked to only display aggregates,
            # and if it is non-aggregate, then skip it.
            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
                assert bn['run_type'] == other_bench['run_type']
                if bn['run_type'] != 'aggregate':
                    continue

            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"

            def get_color(res):
                if res > 0.05:
                    return BC_FAIL
                elif res > -0.07:
                    return BC_WHITE
                else:
                    return BC_CYAN

            tres = calculate_change(bn['real_time'], other_bench['real_time'])
            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
            output_strs += [color_format(use_color,
                                         fmt_str,
                                         BC_HEADER,
                                         bn['name'],
                                         first_col_width,
                                         get_color(tres),
                                         tres,
                                         get_color(cpures),
                                         cpures,
                                         bn['real_time'],
                                         other_bench['real_time'],
                                         bn['cpu_time'],
                                         other_bench['cpu_time'],
                                         endc=BC_ENDC)]

        # After processing the whole partition, if requested, do the U test.
        if utest:
            output_strs += print_utest(partition,
                                       utest_alpha=utest_alpha,
                                       first_col_width=first_col_width,
                                       use_color=use_color)

    return output_strs

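# Minimal usage sketch (the file names are hypothetical; any two Google
# Benchmark JSON result files will do):
#
#   import json
#   with open('baseline.json') as f:
#       baseline = json.load(f)
#   with open('contender.json') as f:
#       contender = json.load(f)
#   for line in generate_difference_report(baseline, contender, utest=True):
#       print(line)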

###############################################################################
# Unit tests


class TestGetUniqueBenchmarkNames(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test3_run0.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            'BM_One',
            'BM_Two',
            'short',  # These two are not sorted
            'medium',  # These two are not sorted
        ]
        json = self.load_results()
        output_lines = get_unique_benchmark_names(json)
        print("\n")
        print("\n".join(output_lines))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            self.assertEqual(expect_lines[i], output_lines[i])


class TestReportDifference(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_basic(self):
        expect_lines = [
            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
            ['BM_100xSlower', '+99.0000', '+99.0000',
             '100', '10000', '100', '10000'],
            ['BM_100xFaster', '-0.9900', '-0.9900',
             '10000', '100', '10000', '100'],
            ['BM_10PercentCPUToTime', '+0.1000',
             '-0.1000', '100', '110', '100', '90'],
            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
            ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceBetweenFamilies(unittest.TestCase):
    def load_result(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput = os.path.join(testInputs, 'test2_run.json')
        with open(testOutput, 'r') as f:
            json = json.load(f)
        return json

    def test_basic(self):
        expect_lines = [
            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
        ]
        json = self.load_result()
        json1 = filter_benchmark(json, "BM_Z.ro", ".")
        json2 = filter_benchmark(json, "BM_O.e", ".")
        output_lines_with_header = generate_difference_report(
            json1, json2, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(len(parts), 7)
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTest(unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
        unittest.TestCase):
    def load_results(self):
        import json
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'Inputs')
        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
        with open(testOutput1, 'r') as f:
            json1 = json.load(f)
        with open(testOutput2, 'r') as f:
            json2 = json.load(f)
        return json1, json2

    def test_utest(self):
        expect_lines = [
            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
            ['BM_Two_pvalue',
             '0.6985',
             '0.6985',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '2.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
            ['short_pvalue',
             '0.7671',
             '0.1489',
             'U',
             'Test,',
             'Repetitions:',
             '2',
             'vs',
             '3.',
             'WARNING:',
             'Results',
             'unreliable!',
             '9+',
             'repetitions',
             'recommended.'],
        ]
        json1, json2 = self.load_results()
        output_lines_with_header = generate_difference_report(
            json1, json2, display_aggregates_only=True,
            utest=True, utest_alpha=0.05, use_color=False)
        output_lines = output_lines_with_header[2:]
        print("\n")
        print("\n".join(output_lines_with_header))
        self.assertEqual(len(output_lines), len(expect_lines))
        for i in range(0, len(output_lines)):
            parts = [x for x in output_lines[i].split(' ') if x]
            self.assertEqual(expect_lines[i], parts)


if __name__ == '__main__':
    unittest.main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;