Skip to content

Commit

Permalink
CI: use benchmark groups when reporting results
Browse files Browse the repository at this point in the history
* Also only add benchmarks to the charts if there is a notable
  difference
* Tweak thresholds to be a bit less sensitive
* Improve script documentation
  • Loading branch information
PeterTh committed Oct 25, 2023
1 parent de1f900 commit 7dcffd3
Showing 1 changed file with 65 additions and 20 deletions.
85 changes: 65 additions & 20 deletions ci/check-perf-impact.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# This script is used to check if a PR has a significant impact on the performance of the
# Celerity benchmarks. It is intended to be run as a GitHub action as part of CI.
# It expects benchmark results to be present in a csv file (see 'BENCH_FN' below).
# It will generate a message which can be posted as a comment to the PR, including a list of
# benchmarks which were significantly affected, and corresponding box plots.

# Note that if you want to test it outside the GitHub action context, you need to provide some
# result data and a baseline branch name. You probably also want to clean up the generated pngs.
# Example run:
# $ cp build_release/bench_test.csv ci/perf/gpuc2_bench.csv
# $ GITHUB_BASE_REF=master ruby ci/check-perf-impact.rb
# $ rm box_*.png

require 'csv'
require 'gruff'
require 'json'
Expand All @@ -7,6 +20,7 @@
BENCH_FN = 'ci/perf/gpuc2_bench.csv'
NAME_COL_1 = "test case" # first name column
NAME_COL_2 = "benchmark name" # second name column
TAG_COL = "tags" # tag column (quoted, comma separated list of tags)
RAW_DATA_COL = "raw" # raw data column (array of runs)

# customizing chart generation
Expand All @@ -18,20 +32,28 @@
GRAPH_WIDTH = 1200 # width of the generated image
THUMB_WIDTH = 120 # width of the thumbnail shown in the PR comment

# reporting thresholds
THRESHOLD_SLOW = 1.1
# reporting thresholds for explicitly calling out benchmarks
THRESHOLD_SLOW = 1.25
THRESHOLD_FAST = 1 / THRESHOLD_SLOW
# graphing and per-group thresholds (should be smaller than the above)
MINOR_THRESHOLD_SLOW = 1.1
MINOR_THRESHOLD_FAST = 1 / MINOR_THRESHOLD_SLOW
# a small offset to reduce the reporting impact of really short-running (nanosecond-range) benchmarks
FLAT_THRESHOLD_OFFSET = 2
FLAT_THRESHOLD_OFFSET = 10
MAX_BENCHMARKS_TO_LIST = 3 # if more than this number of benchmarks is affected, just report the count

# file name for the message that will be posted
MESSAGE_FN = "#{ENV['GITHUB_WORKSPACE']}/check_perf_message.txt"

# check if the expected files and env variables are present
if !File.exists?(BENCH_FN)
puts "Benchmark file #{BENCH_FN} not found.\nExecute this script from the repo root directory."
exit(-1)
end
if !ENV.key?("GITHUB_BASE_REF")
puts "GITHUB_BASE_REF needs to be set"
exit(-1)
end

# prevent duplicate comments for the same benchmark results (by comparing csv hash)
bench_file_digest = Digest::MD5.file(BENCH_FN).hexdigest
Expand All @@ -58,7 +80,8 @@
exit
end

# helper function which retrieves benchmark data from the csv at a specific version
# helper function which retrieves benchmark data from the csv at a specific git version (or the current one)
# returns a map from [category,benchmark_name] to raw data array, and the full data set
def get_data_for_version(version = nil)
# if a different version is supplied, check it out
if version
Expand All @@ -70,15 +93,23 @@ def get_data_for_version(version = nil)
data = CSV.read(BENCH_FN, headers: true)
bench_data_map = {}
data.each do |row|
name = row[NAME_COL_1]+" / "+row[NAME_COL_2]
# determine the category of the benchmark
# (this is the tag which starts with "group:", without the prefix)
category = row[TAG_COL].split(",").find { |tag| tag.start_with?("group:") }
if category.nil? || category.empty?
puts "\e[33mWARNING\e[0m: benchmark #{name} has no category - tags: #{row[TAG_COL]}!"
category = "group:other"
end
category = category.delete_prefix("group:")
raw_data = row[RAW_DATA_COL].delete_prefix('"').delete_suffix('"').split(",").map(&:to_f)
bench_data_map[row[NAME_COL_1]+" / "+row[NAME_COL_2]] = raw_data
bench_data_map[[category,name]] = raw_data
end
# restore the file if it was checked out in a different version
if version
`git restore --staged #{BENCH_FN}`
`git restore #{BENCH_FN}`
end
# return the benchmark data map, and full data set
return bench_data_map, data
end

Expand All @@ -89,6 +120,9 @@ def get_data_for_version(version = nil)
new_data_map, new_data = get_data_for_version()
old_data_map, old_data = get_data_for_version("origin/" + base_ver)

# build a list of all categories in the new data
categories = new_data_map.keys.map { |k| k[0] }.uniq.sort

# statistics utilities
def mean(array)
return nil if array.empty?
Expand All @@ -106,7 +140,7 @@ def scalar_add(array, val)

significantly_slower_benchmarks = []
significantly_faster_benchmarks = []
relative_times = []
relative_times_per_category = Hash.new { |h,k| h[k] = [] }

# perform further analysis and generate box plots if the data changed
if new_data != old_data
Expand Down Expand Up @@ -146,9 +180,14 @@ def get_wheel_color
in_chart = false
end

old_data_map.sort_by { |k,v| mean(v) }.each do |bench_name, old_bench_raw|
old_data_map.sort_by { |k,v| mean(v) }.each do |bench_key, old_bench_raw|
# skip deleted benchmarks
next unless new_data_map.key?(bench_name)
next unless new_data_map.key?(bench_key)

# gather some important information
new_bench_raw = new_data_map[bench_key]
bench_category = bench_key[0]
bench_name = bench_key[1]

# finish the current chart if we have reached the maximum per image
# or if the relative y axis difference becomes too large
Expand Down Expand Up @@ -176,16 +215,11 @@ def get_wheel_color
cur_chart_idx = 0
end

# add old and new boxes to chart
g.data bench_name, old_bench_raw, get_wheel_color
new_bench_raw = new_data_map[bench_name]
g.data nil, new_bench_raw, get_wheel_color

# check if there was a significant difference
# check if there was a highly significant difference
new_median = median(scalar_add(new_bench_raw, FLAT_THRESHOLD_OFFSET))
old_median = median(scalar_add(old_bench_raw, FLAT_THRESHOLD_OFFSET))
rel_difference = new_median / old_median
relative_times << rel_difference
relative_times_per_category[bench_category] << rel_difference
# we output these for easy inspection in the CI log
puts "%3.2f <= %s" % [rel_difference, bench_name]
if rel_difference > THRESHOLD_SLOW
Expand All @@ -196,8 +230,14 @@ def get_wheel_color
significant_perf_improvement_in_this_chart = true
end

# add old and new boxes to chart if they are significant according to the charting thresholds
if rel_difference > MINOR_THRESHOLD_SLOW || rel_difference < MINOR_THRESHOLD_FAST
g.data bench_name, old_bench_raw, get_wheel_color
g.data nil, new_bench_raw, get_wheel_color
cur_chart_idx += 1
end

prev_mean = mean(old_bench_raw)
cur_chart_idx += 1
end
# don't forget to finish the last chart!
finish_chart.()
Expand All @@ -219,11 +259,11 @@ def report_benchmark_list(list)
message += "Please re-run the microbenchmarks and include the results if your commit could potentially affect performance."
else
if !significantly_slower_benchmarks.empty?
message += ":warning: Significant **slowdown** in some microbenchmark results: "
message += ":warning: Significant **slowdown** (>%3.2fx) in some microbenchmark results: " % THRESHOLD_SLOW
message += report_benchmark_list(significantly_slower_benchmarks) + " \n"
end
if !significantly_faster_benchmarks.empty?
message += ":rocket: Significant **speedup** in some microbenchmark results: "
message += ":rocket: Significant **speedup** (<%3.2fx) in some microbenchmark results: " % THRESHOLD_FAST
message += report_benchmark_list(significantly_faster_benchmarks) + " \n"
end
added_benchmarks = new_data_map.keys - old_data_map.keys
Expand All @@ -239,7 +279,12 @@ def report_benchmark_list(list)
if significantly_slower_benchmarks.empty? && significantly_faster_benchmarks.empty?
message += ":heavy_check_mark: No significant performance change in the microbenchmark set. You are good to go! \n"
end
message += "\nOverall relative execution time: **%3.2fx** (mean of relative medians)\n" % mean(relative_times)
message += "\nRelative execution time per category: (mean of relative medians)\n"
categories.each do |category|
category_mean = mean(relative_times_per_category[category])
symbol = category_mean > MINOR_THRESHOLD_SLOW ? ":warning:" : (category_mean < MINOR_THRESHOLD_FAST ? ":rocket:" : "")
message += "* **#{category}** : **%3.2fx** %s\n" % [category_mean, symbol]
end
end
puts message

Expand Down

0 comments on commit 7dcffd3

Please sign in to comment.