From 7dcffd3e9a166358562cbc8e6e4e8902bf808f25 Mon Sep 17 00:00:00 2001 From: Peter Thoman Date: Tue, 24 Oct 2023 15:53:24 +0200 Subject: [PATCH] CI: use benchmark groups when reporting results * Also only add benchmarks to the charts if there is a notable difference * Tweak thresholds to be a bit less sensitive * Improve script documentation --- ci/check-perf-impact.rb | 85 +++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/ci/check-perf-impact.rb b/ci/check-perf-impact.rb index 59dc8ba5f..b786ab8e3 100755 --- a/ci/check-perf-impact.rb +++ b/ci/check-perf-impact.rb @@ -1,3 +1,16 @@ +# This script is used to check if a PR has a significant impact on the performance of the +# Celerity benchmarks. It is intended to be run as a GitHub action as part of CI. +# It expects benchmark results to be present in a csv file (see 'BENCH_FN' below). +# It will generate a message which can be posted as a comment to the PR, including a list of +# benchmarks which were significantly affected, and corresponding box plots. + +# Note that if you want to test it outside the GitHub action context, you need to provide some +# result data and a baseline branch name. You probably also want to clean up the generated pngs. +# Example run: +# $ cp build_release/bench_test.csv ci/perf/gpuc2_bench.csv +# $ GITHUB_BASE_REF=master ruby ci/check-perf-impact.rb +# $ rm box_*.png + require 'csv' require 'gruff' require 'json' @@ -7,6 +20,7 @@ BENCH_FN = 'ci/perf/gpuc2_bench.csv' NAME_COL_1 = "test case" # first name column NAME_COL_2 = "benchmark name" # second name column +TAG_COL = "tags" # tag column (quoted, comma separated list of tags) RAW_DATA_COL = "raw" # raw data column (array of runs) # customizing chart generation @@ -18,20 +32,28 @@ GRAPH_WIDTH = 1200 # width of the generated image THUMB_WIDTH = 120 # width of the thumbnail shown in the PR comment -# reporting thresholds -THRESHOLD_SLOW = 1.1 +# reporting thresholds for explicitly calling out benchmarks +THRESHOLD_SLOW = 1.25 THRESHOLD_FAST = 1 / THRESHOLD_SLOW +# graphing and per-group thresholds (should be smaller than the above) +MINOR_THRESHOLD_SLOW = 1.1 +MINOR_THRESHOLD_FAST = 1 / MINOR_THRESHOLD_SLOW # a small offset to reduce the reporting impact of really short-running (nanosecond-range) benchmarks -FLAT_THRESHOLD_OFFSET = 2 +FLAT_THRESHOLD_OFFSET = 10 MAX_BENCHMARKS_TO_LIST = 3 # if more than this number of benchmarks is affected, just report the count # file name for the message that will be posted MESSAGE_FN = "#{ENV['GITHUB_WORKSPACE']}/check_perf_message.txt" +# check if the expected files and env variables are present if !File.exists?(BENCH_FN) puts "Benchmark file #{BENCH_FN} not found.\nExecute this script from the repo root directory." exit(-1) end +if !ENV.key?("GITHUB_BASE_REF") + puts "GITHUB_BASE_REF needs to be set" + exit(-1) +end # prevent duplicate comments for the same benchmark results (by comparing csv hash) bench_file_digest = Digest::MD5.file(BENCH_FN).hexdigest @@ -58,7 +80,8 @@ exit end -# helper function which retrieves benchmark data from the csv at a specific version +# helper function which retrieves benchmark data from the csv at a specific git version (or the current one) +# returns a map from [category,benchmark_name] to raw data array, and the full data set def get_data_for_version(version = nil) # if a different version is supplied, check it out if version @@ -70,15 +93,23 @@ def get_data_for_version(version = nil) data = CSV.read(BENCH_FN, headers: true) bench_data_map = {} data.each do |row| + name = row[NAME_COL_1]+" / "+row[NAME_COL_2] + # determine the category of the benchmark + # (this is the tag which starts with "group:", without the prefix) + category = row[TAG_COL].split(",").find { |tag| tag.start_with?("group:") } + if category.nil? || category.empty? + puts "\e[33mWARNING\e[0m: benchmark #{name} has no category - tags: #{row[TAG_COL]}!" + category = "group:other" + end + category = category.delete_prefix("group:") raw_data = row[RAW_DATA_COL].delete_prefix('"').delete_suffix('"').split(",").map(&:to_f) - bench_data_map[row[NAME_COL_1]+" / "+row[NAME_COL_2]] = raw_data + bench_data_map[[category,name]] = raw_data end # restore the file if it was checked out in a different version if version `git restore --staged #{BENCH_FN}` `git restore #{BENCH_FN}` end - # return the benchmark data map, and full data set return bench_data_map, data end @@ -89,6 +120,9 @@ def get_data_for_version(version = nil) new_data_map, new_data = get_data_for_version() old_data_map, old_data = get_data_for_version("origin/" + base_ver) +# build a list of all categories in the new data +categories = new_data_map.keys.map { |k| k[0] }.uniq.sort + # statistics utilities def mean(array) return nil if array.empty? @@ -106,7 +140,7 @@ def scalar_add(array, val) significantly_slower_benchmarks = [] significantly_faster_benchmarks = [] -relative_times = [] +relative_times_per_category = Hash.new { |h,k| h[k] = [] } # perform further analysis and generate box plots if the data changed if new_data != old_data @@ -146,9 +180,14 @@ def get_wheel_color in_chart = false end - old_data_map.sort_by { |k,v| mean(v) }.each do |bench_name, old_bench_raw| + old_data_map.sort_by { |k,v| mean(v) }.each do |bench_key, old_bench_raw| # skip deleted benchmarks - next unless new_data_map.key?(bench_name) + next unless new_data_map.key?(bench_key) + + # gather some important information + new_bench_raw = new_data_map[bench_key] + bench_category = bench_key[0] + bench_name = bench_key[1] # finish the current chart if we have reached the maximum per image # or if the relative y axis difference becomes too large @@ -176,16 +215,11 @@ def get_wheel_color cur_chart_idx = 0 end - # add old and new boxes to chart - g.data bench_name, old_bench_raw, get_wheel_color - new_bench_raw = new_data_map[bench_name] - g.data nil, new_bench_raw, get_wheel_color - - # check if there was a significant difference + # check if there was a highly significant difference new_median = median(scalar_add(new_bench_raw, FLAT_THRESHOLD_OFFSET)) old_median = median(scalar_add(old_bench_raw, FLAT_THRESHOLD_OFFSET)) rel_difference = new_median / old_median - relative_times << rel_difference + relative_times_per_category[bench_category] << rel_difference # we output these for easy inspection in the CI log puts "%3.2f <= %s" % [rel_difference, bench_name] if rel_difference > THRESHOLD_SLOW @@ -196,8 +230,14 @@ def get_wheel_color significant_perf_improvement_in_this_chart = true end + # add old and new boxes to chart if they are significant according to the charting thresholds + if rel_difference > MINOR_THRESHOLD_SLOW || rel_difference < MINOR_THRESHOLD_FAST + g.data bench_name, old_bench_raw, get_wheel_color + g.data nil, new_bench_raw, get_wheel_color + cur_chart_idx += 1 + end + prev_mean = mean(old_bench_raw) - cur_chart_idx += 1 end # don't forget to finish the last chart! finish_chart.() @@ -219,11 +259,11 @@ def report_benchmark_list(list) message += "Please re-run the microbenchmarks and include the results if your commit could potentially affect performance." else if !significantly_slower_benchmarks.empty? - message += ":warning: Significant **slowdown** in some microbenchmark results: " + message += ":warning: Significant **slowdown** (>%3.2fx) in some microbenchmark results: " % THRESHOLD_SLOW message += report_benchmark_list(significantly_slower_benchmarks) + " \n" end if !significantly_faster_benchmarks.empty? - message += ":rocket: Significant **speedup** in some microbenchmark results: " + message += ":rocket: Significant **speedup** (<%3.2fx) in some microbenchmark results: " % THRESHOLD_FAST message += report_benchmark_list(significantly_faster_benchmarks) + " \n" end added_benchmarks = new_data_map.keys - old_data_map.keys @@ -239,7 +279,12 @@ def report_benchmark_list(list) if significantly_slower_benchmarks.empty? && significantly_faster_benchmarks.empty? message += ":heavy_check_mark: No significant performance change in the microbenchmark set. You are good to go! \n" end - message += "\nOverall relative execution time: **%3.2fx** (mean of relative medians)\n" % mean(relative_times) + message += "\nRelative execution time per category: (mean of relative medians)\n" + categories.each do |category| + category_mean = mean(relative_times_per_category[category]) + symbol = category_mean > MINOR_THRESHOLD_SLOW ? ":warning:" : (category_mean < MINOR_THRESHOLD_FAST ? ":rocket:" : "") + message += "* **#{category}** : **%3.2fx** %s\n" % [category_mean, symbol] + end end puts message