CI: use benchmark groups when reporting results
* Also only add benchmarks to the charts if there is a notable
  difference
* Tweak thresholds to be a bit less sensitive
* Improve script documentation
PeterTh committed Oct 25, 2023
1 parent 5cc76fa commit 57d07e5
Showing 1 changed file with 65 additions and 20 deletions.
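In essence, the change computes each benchmark's relative median (new over old, after adding a small flat offset), buckets it by its "group:" tag, and only charts or calls out results that cross the thresholds. Below is a minimal, runnable sketch of that idea; the data, category names, and simplified median are illustrative and do not reproduce the script's actual CSV parsing or chart output.

# Sketch only: illustrative data and names; the real script reads ci/perf/gpuc2_bench.csv
# and renders gruff box plots.
THRESHOLD_SLOW       = 1.25                # explicitly call out benchmarks beyond this factor
THRESHOLD_FAST       = 1 / THRESHOLD_SLOW
MINOR_THRESHOLD_SLOW = 1.1                 # chart / summarize anything beyond this factor
MINOR_THRESHOLD_FAST = 1 / MINOR_THRESHOLD_SLOW
FLAT_OFFSET          = 10                  # damps noise from nanosecond-range benchmarks

def median(arr)                            # simplified median, assumes an odd sample count
  arr.sort[arr.size / 2]
end

# hypothetical results: [category, benchmark name] => [old runs, new runs]
results = {
  ["graph",  "generate task graph"] => [[100.0, 104.0, 98.0], [131.0, 128.0, 133.0]],
  ["buffer", "access latency"]      => [[55.0, 54.0, 56.0],   [54.0, 55.0, 53.0]],
}

slower, faster = [], []
per_category = Hash.new { |h, k| h[k] = [] }
results.each do |(category, name), (old_raw, new_raw)|
  rel = median(new_raw.map { |v| v + FLAT_OFFSET }) / median(old_raw.map { |v| v + FLAT_OFFSET })
  per_category[category] << rel
  slower << name if rel > THRESHOLD_SLOW
  faster << name if rel < THRESHOLD_FAST
  puts "would chart #{name}: %.2fx" % rel if rel > MINOR_THRESHOLD_SLOW || rel < MINOR_THRESHOLD_FAST
end
per_category.each { |cat, rels| puts "#{cat}: %.2fx (mean of relative medians)" % (rels.sum / rels.size) }
puts "significant slowdown: #{slower.join(', ')}" unless slower.empty?
puts "significant speedup: #{faster.join(', ')}" unless faster.empty?

The real script additionally deduplicates PR comments by hashing the CSV and draws box plots per chart; only the threshold and grouping logic is reproduced here.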
ci/check-perf-impact.rb
@@ -1,3 +1,16 @@
+# This script is used to check if a PR has a significant impact on the performance of the
+# Celerity benchmarks. It is intended to be run as a GitHub action as part of CI.
+# It expects benchmark results to be present in a csv file (see 'BENCH_FN' below).
+# It will generate a message which can be posted as a comment to the PR, including a list of
+# benchmarks which were significantly affected, and corresponding box plots.
+
+# Note that if you want to test it outside the GitHub action context, you need to provide some
+# result data and a baseline branch name. You probably also want to clean up the generated pngs.
+# Example run:
+# $ cp build_release/bench_test.csv ci/perf/gpuc2_bench.csv
+# $ GITHUB_BASE_REF=master ruby ci/check-perf-impact.rb
+# $ rm box_*.png
+
require 'csv'
require 'gruff'
require 'json'
@@ -7,6 +20,7 @@
BENCH_FN = 'ci/perf/gpuc2_bench.csv'
NAME_COL_1 = "test case" # first name column
NAME_COL_2 = "benchmark name" # second name column
TAG_COL = "tags" # tag column (quoted, comma separated list of tags)
RAW_DATA_COL = "raw" # raw data column (array of runs)

# customizing chart generation
@@ -18,20 +32,28 @@
GRAPH_WIDTH = 1200 # width of the generated image
THUMB_WIDTH = 120 # width of the thumbnail shown in the PR comment

-# reporting thresholds
-THRESHOLD_SLOW = 1.1
+# reporting thresholds for explicitly calling out benchmarks
+THRESHOLD_SLOW = 1.25
THRESHOLD_FAST = 1 / THRESHOLD_SLOW
+# graphing and per-group thresholds (should be smaller than the above)
+MINOR_THRESHOLD_SLOW = 1.1
+MINOR_THRESHOLD_FAST = 1 / MINOR_THRESHOLD_SLOW
+# a small offset to reduce the reporting impact of really short-running (nanosecond-range) benchmarks
-FLAT_THRESHOLD_OFFSET = 2
+FLAT_THRESHOLD_OFFSET = 10
MAX_BENCHMARKS_TO_LIST = 3 # if more than this number of benchmarks is affected, just report the count

# file name for the message that will be posted
MESSAGE_FN = "#{ENV['GITHUB_WORKSPACE']}/check_perf_message.txt"

# check if the expected files and env variables are present
if !File.exists?(BENCH_FN)
puts "Benchmark file #{BENCH_FN} not found.\nExecute this script from the repo root directory."
exit(-1)
end
if !ENV.key?("GITHUB_BASE_REF")
puts "GITHUB_BASE_REF needs to be set"
exit(-1)
end

# prevent duplicate comments for the same benchmark results (by comparing csv hash)
bench_file_digest = Digest::MD5.file(BENCH_FN).hexdigest
@@ -58,7 +80,8 @@
exit
end

-# helper function which retrieves benchmark data from the csv at a specific version
+# helper function which retrieves benchmark data from the csv at a specific git version (or the current one)
+# returns a map from [category,benchmark_name] to raw data array, and the full data set
def get_data_for_version(version = nil)
# if a different version is supplied, check it out
if version
@@ -70,15 +93,23 @@ def get_data_for_version(version = nil)
data = CSV.read(BENCH_FN, headers: true)
bench_data_map = {}
data.each do |row|
+name = row[NAME_COL_1]+" / "+row[NAME_COL_2]
+# determine the category of the benchmark
+# (this is the tag which starts with "group:", without the prefix)
+category = row[TAG_COL].split(",").find { |tag| tag.start_with?("group:") }
+if category.nil? || category.empty?
+puts "\e[33mWARNING\e[0m: benchmark #{name} has no category - tags: #{row[TAG_COL]}!"
+category = "group:other"
+end
+category = category.delete_prefix("group:")
raw_data = row[RAW_DATA_COL].delete_prefix('"').delete_suffix('"').split(",").map(&:to_f)
-bench_data_map[row[NAME_COL_1]+" / "+row[NAME_COL_2]] = raw_data
+bench_data_map[[category,name]] = raw_data
end
# restore the file if it was checked out in a different version
if version
`git restore --staged #{BENCH_FN}`
`git restore #{BENCH_FN}`
end
# return the benchmark data map, and full data set
return bench_data_map, data
end

@@ -89,6 +120,9 @@ def get_data_for_version(version = nil)
new_data_map, new_data = get_data_for_version()
old_data_map, old_data = get_data_for_version("origin/" + base_ver)

+# build a list of all categories in the new data
+categories = new_data_map.keys.map { |k| k[0] }.uniq.sort

# statistics utilities
def mean(array)
return nil if array.empty?
@@ -106,7 +140,7 @@ def scalar_add(array, val)

significantly_slower_benchmarks = []
significantly_faster_benchmarks = []
-relative_times = []
+relative_times_per_category = Hash.new { |h,k| h[k] = [] }

# perform further analysis and generate box plots if the data changed
if new_data != old_data
@@ -146,9 +180,14 @@ def get_wheel_color
in_chart = false
end

-old_data_map.sort_by { |k,v| mean(v) }.each do |bench_name, old_bench_raw|
+old_data_map.sort_by { |k,v| mean(v) }.each do |bench_key, old_bench_raw|
# skip deleted benchmarks
-next unless new_data_map.key?(bench_name)
+next unless new_data_map.key?(bench_key)

+# gather some important information
+new_bench_raw = new_data_map[bench_key]
+bench_category = bench_key[0]
+bench_name = bench_key[1]

# finish the current chart if we have reached the maximum per image
# or if the relative y axis difference becomes too large
Expand Down Expand Up @@ -176,16 +215,11 @@ def get_wheel_color
cur_chart_idx = 0
end

-# add old and new boxes to chart
-g.data bench_name, old_bench_raw, get_wheel_color
-new_bench_raw = new_data_map[bench_name]
-g.data nil, new_bench_raw, get_wheel_color
-
-# check if there was a significant difference
+# check if there was a highly significant difference
new_median = median(scalar_add(new_bench_raw, FLAT_THRESHOLD_OFFSET))
old_median = median(scalar_add(old_bench_raw, FLAT_THRESHOLD_OFFSET))
rel_difference = new_median / old_median
-relative_times << rel_difference
+relative_times_per_category[bench_category] << rel_difference
# we output these for easy inspection in the CI log
puts "%3.2f <= %s" % [rel_difference, bench_name]
if rel_difference > THRESHOLD_SLOW
@@ -196,8 +230,14 @@ def get_wheel_color
significant_perf_improvement_in_this_chart = true
end

+# add old and new boxes to chart if they are significant according to the charting thresholds
+if rel_difference > MINOR_THRESHOLD_SLOW || rel_difference < MINOR_THRESHOLD_FAST
+g.data bench_name, old_bench_raw, get_wheel_color
+g.data nil, new_bench_raw, get_wheel_color
+cur_chart_idx += 1
+end

prev_mean = mean(old_bench_raw)
-cur_chart_idx += 1
end
# don't forget to finish the last chart!
finish_chart.()
@@ -219,11 +259,11 @@ def report_benchmark_list(list)
message += "Please re-run the microbenchmarks and include the results if your commit could potentially affect performance."
else
if !significantly_slower_benchmarks.empty?
message += ":warning: Significant **slowdown** in some microbenchmark results: "
message += ":warning: Significant **slowdown** (>%3.2fx) in some microbenchmark results: " % THRESHOLD_SLOW
message += report_benchmark_list(significantly_slower_benchmarks) + " \n"
end
if !significantly_faster_benchmarks.empty?
message += ":rocket: Significant **speedup** in some microbenchmark results: "
message += ":rocket: Significant **speedup** (<%3.2fx) in some microbenchmark results: " % THRESHOLD_FAST
message += report_benchmark_list(significantly_faster_benchmarks) + " \n"
end
added_benchmarks = new_data_map.keys - old_data_map.keys
@@ -239,7 +279,12 @@ def report_benchmark_list(list)
if significantly_slower_benchmarks.empty? && significantly_faster_benchmarks.empty?
message += ":heavy_check_mark: No significant performance change in the microbenchmark set. You are good to go! \n"
end
message += "\nOverall relative execution time: **%3.2fx** (mean of relative medians)\n" % mean(relative_times)
message += "\nRelative execution time per category: (mean of relative medians)\n"
categories.each do |category|
category_mean = mean(relative_times_per_category[category])
symbol = category_mean > MINOR_THRESHOLD_SLOW ? ":warning:" : (category_mean < MINOR_THRESHOLD_FAST ? ":rocket:" : "")
message += "* **#{category}** : **%3.2fx** %s\n" % [category_mean, symbol]
end
end
puts message

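A note on the FLAT_THRESHOLD_OFFSET bump in the diff above: adding a constant to every sample before taking the medians deliberately shrinks the relative difference of very short-running benchmarks, so nanosecond-scale jitter no longer trips the thresholds. A quick worked example with made-up numbers:

# Made-up numbers: a 5 ns benchmark that drifts to 6 ns between runs.
old_median, new_median = 5.0, 6.0
puts new_median / old_median                        # => 1.2, would exceed MINOR_THRESHOLD_SLOW (1.1)
offset = 10                                         # FLAT_THRESHOLD_OFFSET after this commit
puts (new_median + offset) / (old_median + offset)  # => ~1.07, stays below both thresholds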
