mojo_benchmark: aggregate results over multiple runs.
This patch makes it possible to pass `--aggregate N` to `mojo_benchmark`,
causing the script to run each benchmark N times and aggregate the
results.
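For example, the following invocation (the benchmark list file name is
hypothetical) would run each listed benchmark ten times:
  mojo_benchmark benchmark_list.py --aggregate 10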
When uploading to the dashboard, the full vector of results from the
individual runs is sent, so that statistics (e.g. standard deviation)
are visible in the dashboard.
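As a rough sketch, the chart data recorded for an aggregated run then
takes the following shape, using the 'list_of_scalar_values' entry type
added below (the chart, measurement and benchmark names as well as the
numbers are made up):
  # Hypothetical chart data for a benchmark run with --aggregate 3.
  {
    'format_version': '1.0',
    'benchmark_name': 'mojo_benchmark',
    'charts': {
      'startup__default': {
        'time_to_run': {
          'type': 'list_of_scalar_values',
          'units': 'ms',
          'values': [12.1, 11.8, 12.4],
        },
      },
    },
  }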
Fixes https://github.com/domokit/devtools/issues/54.
R=qsr@chromium.org
Review URL: https://codereview.chromium.org/1433693004 .
Cr-Mirrored-From: https://github.com/domokit/mojo
Cr-Mirrored-Commit: 574e2347ec780280488c5ef64a803d5c33f06c4d
diff --git a/devtoolslib/benchmark.py b/devtoolslib/benchmark.py
index 3383dd4..e55a884 100644
--- a/devtoolslib/benchmark.py
+++ b/devtoolslib/benchmark.py
@@ -43,14 +43,18 @@
return measurement_results
-class Results(object):
+class Outcome(object):
"""Holds results of a benchmark run."""
def __init__(self, succeeded, error_str, output):
self.succeeded = succeeded
self.error_str = error_str
self.output = output
- self.measurements = None
+ # Maps measurement specs to measurement results given as floats. Only
+ # measurements that succeeded (i.e. whose results were retrieved) are
+ # represented.
+ self.results = {}
+ self.some_measurements_failed = False
def run(shell, shell_args, app, duration_seconds, measurements, verbose,
@@ -59,7 +63,7 @@
appropriate arguments and returns the produced output.
Returns:
- A tuple of (succeeded, error_msg, output).
+ An instance of Outcome holding the results of the run.
"""
timeout = duration_seconds + _EXTRA_TIMEOUT
benchmark_args = []
@@ -89,15 +93,21 @@
shell_args, timeout=timeout)
if did_time_out:
- return Results(False, 'timed out', output)
+ return Outcome(False, 'timed out', output)
if return_code:
- return Results(False, 'return code: ' + str(return_code), output)
+ return Outcome(False, 'return code: ' + str(return_code), output)
# Pull the trace file even if some measurements are missing, as it can be
# useful in debugging.
if device_output_file:
shell.pull_file(device_output_file, output_file, remove_original=True)
- results = Results(True, None, output)
- results.measurements = _parse_measurement_results(output)
- return results
+ outcome = Outcome(True, None, output)
+ parsed_results = _parse_measurement_results(output)
+ for measurement in measurements:
+ spec = measurement['spec']
+ if spec in parsed_results:
+ outcome.results[spec] = parsed_results[spec]
+ else:
+ outcome.some_measurements_failed = True
+ return outcome
diff --git a/devtoolslib/perf_dashboard.py b/devtoolslib/perf_dashboard.py
index f203edf..d5388c3 100644
--- a/devtoolslib/perf_dashboard.py
+++ b/devtoolslib/perf_dashboard.py
@@ -40,6 +40,15 @@
'units': units,
'value': value}
+ def record_vector(self, chart_name, value_name, units, values):
+ """Records a single measurement value of a list of scalars type."""
+ if chart_name not in self.charts:
+ self.charts[chart_name] = {}
+ self.charts[chart_name][value_name] = {
+ 'type': 'list_of_scalar_values',
+ 'units': units,
+ 'values': values}
+
def get_chart_data(self):
"""Returns the JSON string representing the recorded chart data, wrapping
it with the required meta data."""
diff --git a/devtoolslib/perf_dashboard_unittest.py b/devtoolslib/perf_dashboard_unittest.py
index 98714c9..efa8105 100644
--- a/devtoolslib/perf_dashboard_unittest.py
+++ b/devtoolslib/perf_dashboard_unittest.py
@@ -73,3 +73,26 @@
'type': 'scalar',
'units': 'ms',
'value': 2}, charts['chart2']['val2'])
+
+ def test_vectors(self):
+ """Test recording a list of scalar values."""
+ recorder = ChartDataRecorder('benchmark')
+ recorder.record_vector('chart1', 'val1', 'ms', [1, 2])
+ recorder.record_vector('chart2', 'val2', 'ms', [])
+
+ result = recorder.get_chart_data()
+ self.assertEquals('1.0', result['format_version'])
+ self.assertEquals('benchmark', result['benchmark_name'])
+
+ charts = result['charts']
+ self.assertEquals(2, len(charts))
+ self.assertEquals(1, len(charts['chart1']))
+ self.assertEquals({
+ 'type': 'list_of_scalar_values',
+ 'units': 'ms',
+ 'values': [1, 2]}, charts['chart1']['val1'])
+ self.assertEquals(1, len(charts['chart2']))
+ self.assertEquals({
+ 'type': 'list_of_scalar_values',
+ 'units': 'ms',
+ 'values': []}, charts['chart2']['val2'])
diff --git a/mojo_benchmark b/mojo_benchmark
index b6c17ae..9a4803c 100755
--- a/mojo_benchmark
+++ b/mojo_benchmark
@@ -59,12 +59,90 @@
return variants
+def _print_benchmark_error(outcome):
+ if not outcome.succeeded:
+ print 'benchmark failed: ' + outcome.error_str
+ if outcome.some_measurements_failed:
+ print 'some measurements failed'
+ print 'output: '
+ print '-' * 72
+ print outcome.output
+ print '-' * 72
+
+
+def _print_results(benchmark_name, variant_name, results, measurements,
+ aggregate):
+ print '[ %s ] %s ' % (benchmark_name, variant_name)
+ for measurement in measurements:
+ print ' ' + measurement['name'] + ': ',
+ if measurement['spec'] in results:
+ if aggregate:
+ print str(results[measurement['spec']])
+ else:
+ if len(results[measurement['spec']]) == 0:
+ print '?'
+ else:
+ print '%f' % results[measurement['spec']][0]
+ else:
+ print '?'
+
+
+def _upload_results(benchmark_name, variant_name, results, measurements,
+ script_args):
+ anything_recorded = False
+ chart_data_recorder = perf_dashboard.ChartDataRecorder(script_args.test_name)
+ chart_name = benchmark_name + '__' + variant_name
+
+ for measurement in measurements:
+ if measurement['spec'] in results:
+ if not results[measurement['spec']]:
+ continue
+
+ if script_args.aggregate:
+ chart_data_recorder.record_vector(
+ perf_dashboard.normalize_label(chart_name),
+ perf_dashboard.normalize_label(measurement['name']),
+ 'ms', results[measurement['spec']])
+ else:
+ chart_data_recorder.record_scalar(
+ perf_dashboard.normalize_label(chart_name),
+ perf_dashboard.normalize_label(measurement['name']),
+ 'ms', results[measurement['spec']][0])
+ anything_recorded = True
+
+ if not anything_recorded:
+ # Don't upload empty packets, see
+ # https://github.com/catapult-project/catapult/issues/1733 .
+ return True
+
+ return perf_dashboard.upload_chart_data(
+ script_args.master_name, script_args.bot_name,
+ script_args.test_name, script_args.builder_name,
+ script_args.build_number, chart_data_recorder.get_chart_data(),
+ script_args.server_url, script_args.dry_run)
+
+
+def _argparse_aggregate_type(value):
+ try:
+ cast_value = int(value)
+ except ValueError:
+ raise argparse.ArgumentTypeError('value is not a positive integer')
+
+ if cast_value < 1:
+ raise argparse.ArgumentTypeError('value is not a positive integer')
+ return cast_value
+
+
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=_DESCRIPTION)
parser.add_argument('benchmark_list_file', type=file,
help='a file listing benchmarks to run')
+ parser.add_argument('--aggregate', type=_argparse_aggregate_type,
+ help='aggregate results over multiple runs. The value '
+ 'has to be a positive integer indicating the number of '
+ 'runs.')
parser.add_argument('--save-all-traces', action='store_true',
help='save the traces produced by benchmarks to disk')
perf_dashboard.add_argparse_server_arguments(parser)
@@ -85,72 +163,55 @@
exec script_args.benchmark_list_file in benchmark_list_params
exit_code = 0
+ run_count = script_args.aggregate if script_args.aggregate else 1
for benchmark_spec in benchmark_list_params['benchmarks']:
benchmark_name = benchmark_spec['name']
+ variants = _generate_benchmark_variants(benchmark_spec)
+ variant_results = {variant_spec['variant_name']: {}
+ for variant_spec in variants}
- for variant_spec in _generate_benchmark_variants(benchmark_spec):
+ for _ in xrange(run_count):
+ for variant_spec in variants:
+ variant_name = variant_spec['variant_name']
+ app = variant_spec['app']
+ duration = variant_spec['duration']
+ shell_args = variant_spec.get('shell-args', []) + common_shell_args
+ measurements = variant_spec['measurements']
+
+ output_file = None
+ if script_args.save_all_traces:
+ output_file = 'benchmark-%s-%s-%s.trace' % (
+ benchmark_name.replace(' ', '_'),
+ variant_name.replace(' ', '_'),
+ time.strftime('%Y%m%d%H%M%S'))
+
+ outcome = benchmark.run(
+ shell, shell_args, app, duration, measurements, script_args.verbose,
+ script_args.android, output_file)
+
+ if not outcome.succeeded or outcome.some_measurements_failed:
+ _print_benchmark_error(outcome)
+ exit_code = 1
+
+ if outcome.succeeded:
+ for measurement_spec in outcome.results:
+ if measurement_spec not in variant_results[variant_name]:
+ variant_results[variant_name][measurement_spec] = []
+ variant_results[variant_name][measurement_spec].append(
+ outcome.results[measurement_spec])
+
+ for variant_spec in variants:
variant_name = variant_spec['variant_name']
- app = variant_spec['app']
- duration = variant_spec['duration']
- shell_args = variant_spec.get('shell-args', []) + common_shell_args
- measurements = variant_spec['measurements']
+ _print_results(benchmark_name, variant_name,
+ variant_results[variant_name],
+ variant_spec['measurements'], script_args.aggregate)
- output_file = None
- if script_args.save_all_traces:
- output_file = 'benchmark-%s-%s-%s.trace' % (
- benchmark_name.replace(' ', '_'),
- variant_name.replace(' ', '_'),
- time.strftime('%Y%m%d%H%M%S'))
-
- chart_data_recorder = None
if script_args.upload:
- chart_data_recorder = perf_dashboard.ChartDataRecorder(
- script_args.test_name)
-
- results = benchmark.run(
- shell, shell_args, app, duration, measurements, script_args.verbose,
- script_args.android, output_file)
-
- print '[ %s ] %s ' % (benchmark_name, variant_name)
-
- some_measurements_failed = False
- some_measurements_succeeded = False
- if results.succeeded:
- # Iterate over the list of specs, not the dictionary, to detect missing
- # results and preserve the required order.
- for measurement in measurements:
- if measurement['spec'] in results.measurements:
- result = results.measurements[measurement['spec']]
- print '%10.4f %s' % (result, measurement['name'])
-
- if chart_data_recorder:
- chart_name = benchmark_name + '__' + variant_name
- chart_data_recorder.record_scalar(
- perf_dashboard.normalize_label(chart_name),
- perf_dashboard.normalize_label(measurement['name']),
- 'ms', result)
- some_measurements_succeeded = True
- else:
- print '? %s' % measurement['name']
- some_measurements_failed = True
-
- if not results.succeeded or some_measurements_failed:
- if not results.succeeded:
- print 'benchmark failed: ' + results.error_str
- if some_measurements_failed:
- print 'some measurements failed'
- print 'output: '
- print '-' * 72
- print results.output
- print '-' * 72
- exit_code = 1
-
- if script_args.upload and some_measurements_succeeded:
- if not perf_dashboard.upload_chart_data(
- script_args.master_name, script_args.bot_name,
- script_args.test_name, script_args.builder_name,
- script_args.build_number, chart_data_recorder.get_chart_data(),
- script_args.server_url, script_args.dry_run):
+ upload_succeeded = _upload_results(benchmark_name, variant_name,
+ variant_results[variant_name],
+ variant_spec['measurements'],
+ script_args)
+ if not upload_succeeded:
exit_code = 1
return exit_code