#!/usr/bin/env python
"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import os
import sys
import unittest

import gbench
from gbench import util, report
from gbench.util import *


def check_inputs(in1, in2, flags):
    """
    Perform checking on the user provided inputs and diagnose any abnormalities
    """
    in1_kind, in1_err = classify_input_file(in1)
    in2_kind, in2_err = classify_input_file(in2)
    output_file = find_benchmark_flag('--benchmark_out=', flags)
    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
        print(("WARNING: '--benchmark_out=%s' will be passed to both "
               "benchmarks causing it to be overwritten") % output_file)
    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
        print("WARNING: passing optional flags has no effect since both "
              "inputs are JSON")
    if output_type is not None and output_type != 'json':
        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
               " is not supported.") % output_type)
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description='versatile benchmark output compare tool')

    parser.add_argument(
        '-a',
        '--display_aggregates_only',
        dest='display_aggregates_only',
        action="store_true",
        help="If there are repetitions, by default we display everything: the"
             " actual runs and the computed aggregates. Sometimes it is "
             "desirable to only view the aggregates, e.g. when there are a "
             "lot of repetitions. Note that only the display is affected; "
             "internally, all the actual runs are still used, e.g. for the "
             "U test.")

    utest = parser.add_argument_group()
    utest.add_argument(
        '--no-utest',
        dest='utest',
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null "
             "hypothesis that it is equally likely that a randomly selected "
             "value from one sample will be less than or greater than a "
             "randomly selected value from a second sample.\n"
             "WARNING: requires **LARGE** (no less than {}) number of "
             "repetitions to be meaningful!\n"
             "The test is performed by default if at least {} repetitions were "
             "done.\nThis option can disable the U Test.".format(
                 report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest='utest_alpha',
        default=alpha_default,
        type=float,
        help=("significance level alpha. If the calculated p-value is below "
              "this value, then the result is said to be statistically "
              "significant and the null hypothesis is rejected.\n"
              "(default: %0.4f)") % alpha_default)

    subparsers = parser.add_subparsers(
        help='This tool has multiple modes of operation:',
        dest='mode')

    parser_a = subparsers.add_parser(
        'benchmarks',
        help='The simplest use-case: compare all the output of these two benchmarks')
    baseline = parser_a.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    contender = parser_a.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    parser_a.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_b = subparsers.add_parser(
        'filters',
        help='Compare one filter of a benchmark against another filter of the same benchmark')
    baseline = parser_b.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test',
        metavar='test',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_b.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_b.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_c = subparsers.add_parser(
        'benchmarksfiltered',
        help='Compare one filter of the first benchmark against another filter of the second benchmark')
    baseline = parser_c.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_c.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='The second benchmark executable or JSON output file, which will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_c.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    return parser


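# Illustrative invocations for the three modes defined above (the benchmark
# paths and filter regexes are placeholders, not files shipped with this tool):
#
#   compare.py benchmarks ./baseline_bench ./contender_bench
#   compare.py filters ./bench BM_Baseline BM_Contender
#   compare.py benchmarksfiltered ./bench1 BM_Foo ./bench2 BM_Bar
#
# Each 'test' argument may be either a benchmark executable or a JSON file
# previously produced with --benchmark_out=<file>.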
def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == 'benchmarks':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ''
        filter_contender = ''

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = 'Comparing %s to %s' % (test_baseline, test_contender)
    elif args.mode == 'filters':
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = 'Comparing %s to %s (from %s)' % (
            filter_baseline, filter_contender, args.test[0].name)
    elif args.mode == 'benchmarksfiltered':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = 'Comparing %s (from %s) to %s (from %s)' % (
            filter_baseline, test_baseline, filter_contender, test_contender)
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ['--benchmark_display_aggregates_only=true']

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
        options_contender = ['--benchmark_filter=%s' % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.run_or_load_benchmark(
        test_baseline, benchmark_options + options_baseline)
    json2 = json2_orig = gbench.util.run_or_load_benchmark(
        test_contender, benchmark_options + options_contender)

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement)

    # Diff and output
    output_lines = gbench.report.generate_difference_report(
        json1, json2, args.display_aggregates_only,
        args.utest, args.utest_alpha)
    print(description)
    for ln in output_lines:
        print(ln)


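# The self-tests below only exercise the argument parser. A minimal sketch of
# how one might run them, assuming this file is importable as 'compare':
#
#   python -m unittest compare.TestParser -v
#
# or temporarily switch the call at the bottom of the file to unittest.main().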
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'gbench',
            'Inputs')
        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ['-a', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['d'])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'f')

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'g')


if __name__ == '__main__':
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;