Support for website image generation #12

Merged
merged 2 commits on Nov 16, 2023
5 changes: 4 additions & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: rren
Title: Renaissance Benchmark Data Processing Package
Version: 0.0.7
Version: 0.0.8
Authors@R:
person ("Petr", "Tuma", email = "[email protected]", role = c ("aut", "cre"), comment = c (ORCID = "0000-0002-7035-2322"))
Description: A utility package for processing the data produced by the Renaissance benchmark suite (https://renaissance.dev).
@@ -19,11 +19,14 @@ Imports:
cli (>= 3.6.1),
digest (>= 0.6.33),
dplyr (>= 1.1.3),
forcats (>= 1.0.0),
fs (>= 1.6.3),
ggplot2 (>= 3.4.4),
glue (>= 1.6.2),
jsonlite (>= 1.8.7),
logger (>= 0.2.2),
prettyunits (>= 1.2.0),
readr (>= 2.1.4),
rlang (>= 1.1.1),
stringr (>= 1.5.0),
tibble (>= 3.2.1),
9 changes: 9 additions & 0 deletions NAMESPACE
@@ -3,10 +3,14 @@
export(assert_renaissance)
export(check_renaissance)
export(compute_flat_percentile_ci)
export(compute_flat_relative_percentile_ci)
export(compute_hierarchical_percentile_ci)
export(compute_hierarchical_relative_percentile_ci)
export(compute_list_vectors_hierarchical_percentile_ci)
export(compute_list_vectors_pair_hierarchical_percentile_ci)
export(compute_trends)
export(compute_vector_flat_percentile_ci)
export(compute_vector_pair_flat_percentile_ci)
export(expect_renaissance)
export(flag_outliers_global)
export(flag_outliers_window)
@@ -21,6 +25,9 @@ export(list_segment_boundaries)
export(load_file_json)
export(load_path_json)
export(locate_vector_segments)
export(plot_default_labels)
export(plot_website_stripes)
export(plot_website_violins)
export(preserve_last_n)
export(preserve_last_sec)
export(remove_outliers_global)
@@ -31,6 +38,8 @@ export(rren_example)
import(checkmate)
import(cli)
import(dplyr)
import(forcats)
import(ggplot2)
import(glue)
import(logger)
import(prettyunits)
34 changes: 20 additions & 14 deletions R/example.R
@@ -13,33 +13,39 @@ rren_example <- function (file_name = '') {

#' Return data frame with artificial data.
#'
#' @param data Data to use.
#' @param vm_count Count of VM combinations to generate.
#' @param run_count Count of run combinations to generate.
#' @param benchmark_count Count of benchmark combinations to generate.
#' @return data Frame with artificial data.
#' Data to use for each run is the same. The result tibble
#' is arranged to iterate first over runs, then benchmarks,
#' then virtual machines. This guarantee is useful
#' when differences between runs are to be
#' injected later.
#'
#' @param data Data to use for each run.
#' @param run_count Count of run combinations to generate per virtual machine and benchmark.
#' @param benchmark_count Count of benchmark combinations to generate per virtual machine.
#' @param vm_count Count of virtual machine combinations to generate.
#' @return Tibble with artificial data.
#' @export
rren_artificial <- function (data, vm_count = 1L, run_count = 1L, benchmark_count = 1L) {
rren_artificial <- function (data, run_count = 1L, benchmark_count = 1L, vm_count = 1L) {

result <- tibble (time = data) |>
mutate (
index = row_number (),
total = cumsum (data))

list_times_run <- lapply (seq (run_count), function (x) result |> mutate (run = factor (glue::glue ('Run {x}'))))
result_times_run <- bind_rows (list_times_run)

list_times_benchmark <- lapply (seq (benchmark_count), function (x) result_times_run |> mutate (benchmark = factor (glue::glue ('Benchmark {x}'))))
result_times_benchmark <- bind_rows (list_times_benchmark)

list_times_vm <- lapply (seq (vm_count),
function (x) result |> mutate (
function (x) result_times_benchmark |> mutate (
vm_name = factor (glue::glue ('Name {x}')),
vm_version = factor (glue::glue ('Version {x}')),
vm_configuration = factor (glue::glue ('Configuration {x}')),
vm = factor (digest::digest (c (.data $ vm_name, .data $ vm_version, .data $ vm_configuration), algo = 'murmur32'))))

result_times_vm <- bind_rows (list_times_vm)

list_times_run <- lapply (seq (run_count), function (x) result_times_vm |> mutate (run = factor (glue::glue ('Run {x}'))))
result_times_run <- bind_rows (list_times_run)

list_times_benchmark <- lapply (seq (benchmark_count), function (x) result_times_run |> mutate (benchmark = factor (glue::glue ('Benchmark {x}'))))
result_times_benchmark <- bind_rows (list_times_benchmark)

return (result_times_benchmark)
return (result_times_vm)
}
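For reference, a minimal usage sketch (not part of this change set) of the ordering guarantee documented above, assuming the package is attached: two runs, two benchmarks, one virtual machine, three samples per run.

library (rren)
artificial <- rren_artificial (c (1, 2, 3), run_count = 2L, benchmark_count = 2L, vm_count = 1L)
head (artificial, 12)
# First six rows belong to 'Benchmark 1' with 'Run 1' then 'Run 2' in blocks of
# three, followed by the same pattern for 'Benchmark 2'.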
4 changes: 2 additions & 2 deletions R/filter.R
@@ -149,8 +149,8 @@ identify_vector_outliers_window <- function (.input, .window = 333, .limit = 0.0
limits_hi <- limits [2, ] + ranges * .slack

# Stretch limits across border samples.
limits_lo <- c (rep.int (first (limits_lo), floor ((.window - 1) / 2)), limits_lo, rep (last (limits_lo), ceiling ((.window - 1) / 2)))
limits_hi <- c (rep.int (first (limits_hi), floor ((.window - 1) / 2)), limits_hi, rep (last (limits_hi), ceiling ((.window - 1) / 2)))
limits_lo <- c (rep.int (first (limits_lo), floor ((.window - 1) / 2)), limits_lo, rep.int (last (limits_lo), ceiling ((.window - 1) / 2)))
limits_hi <- c (rep.int (first (limits_hi), floor ((.window - 1) / 2)), limits_hi, rep.int (last (limits_hi), ceiling ((.window - 1) / 2)))

return (.input < limits_lo | .input > limits_hi)
}
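An arithmetic sketch of the border stretching, not part of the diff: assuming the rolling limits have length n - .window + 1, padding the first limit floor ((.window - 1) / 2) times and the last limit ceiling ((.window - 1) / 2) times restores a vector of length n that aligns with .input.

window <- 5
limits_lo <- c (10, 11, 12, 13, 14, 15)   # 6 rolling limits for 10 input samples
padded_lo <- c (rep.int (limits_lo [1], floor ((window - 1) / 2)),
                limits_lo,
                rep.int (limits_lo [6], ceiling ((window - 1) / 2)))
length (padded_lo)   # 10, same as the input length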
166 changes: 147 additions & 19 deletions R/interval.R
@@ -1,34 +1,42 @@
# ----------------------------------------------------------------
# Bootstrap

bootstrap_replicates_vector <- function (.input, .statistic, .R) {
# Sample one less than length to compensate variability bias.
# This may not make much sense for some statistics ?
length_reduced <- length (.input) - 1
bootstrap_replicates_vector <- function (.input, .statistic, .R, .reduction) {

# Reduction in sample size can help compensate for bootstrap standard error bias.
length_original <- length (.input)
length_reduced <- length (.input) - .reduction

result <- rep.int (NA, .R)
# TODO This can be made (much) faster by blocking.
for (counter in seq.int (1, .R)) {
indices <- sample (length_reduced, replace = TRUE)
indices <- sample (length_original, length_reduced, replace = TRUE)
samples <- .input [indices]
result [counter] <- .statistic (samples)
}

return (result)
}
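A small sketch of what the new .reduction parameter does, not part of the diff: with .reduction = 1 each replicate resamples one value fewer than the input, which slightly widens the spread of the replicates and can offset the usual underestimation of the bootstrap standard error.

input <- rnorm (20)
indices <- sample (length (input), length (input) - 1, replace = TRUE)
length (indices)   # 19 resampled indices instead of 20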


bootstrap_replicates_list_vectors <- function (.input, .statistic_outer, .statistic_inner, .R) {
# Sample one less than length to compensate variability bias.
# This may not make much sense for some statistics ?
length_reduced <- length (.input) - 1
if (length_reduced < 1) {
bootstrap_replicates_list_vectors <- function (.input, .statistic_outer, .statistic_inner, .R, .reduction) {

# Reduction in sample size can help compensate for bootstrap standard error bias.
length_original <- length (.input)
length_reduced <- length (.input) - .reduction

if (length_original == 1) {

# Just one outer level, perform one level bootstrap.
result <- bootstrap_replicates_vector (.input [[1]], function (x) .statistic_outer (.statistic_inner (x)), .R)
# Note that this does not reduce sample size at the inner level.
result <- bootstrap_replicates_vector (.input [[1]], function (x) .statistic_outer (.statistic_inner (x)), .R, 0)

} else {
# More outer levels, perform two level bootstrap.
result <- rep.int (NA, .R)
for (counter_result in seq.int (1, .R)) {
# TODO This can be made (much) faster by blocking.
indices_outer <- sample (length_reduced, replace = TRUE)
indices_outer <- sample (length_original, length_reduced, replace = TRUE)
samples_outer <- rep.int (NA, length_reduced)
for (counter_outer in seq.int (1, length_reduced)) {
index_outer <- indices_outer [counter_outer]
@@ -39,6 +47,7 @@ bootstrap_replicates_list_vectors <- function (.input, .statistic_outer, .statis
result [counter_result] <- .statistic_outer (samples_outer)
}
}

return (result)
}
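For orientation, a sketch of the hierarchical input shape, not part of the diff: a list with one numeric vector per run. The inner statistic summarizes each run, the outer statistic summarizes across runs, and the two level bootstrap resamples runs before resampling values within each run.

runs <- list (c (1.0, 1.1, 0.9), c (1.4, 1.5, 1.6))
mean (vapply (runs, mean, numeric (1)))   # point estimate: mean of per run means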

@@ -52,11 +61,12 @@ bootstrap_replicates_list_vectors <- function (.input, .statistic_outer, .statis
#' @param .statistic Statistic to compute confidence interval for.
#' @param .confidence Confidence level.
#' @param .R Number of replicates.
#' @param .reduction Replicate size reduction.
#' @return Tibble with confidence interval.
#' @export
compute_vector_flat_percentile_ci <- function (.input, .statistic = mean, .confidence = 0.99, .R = 10000) {
compute_vector_flat_percentile_ci <- function (.input, .statistic = mean, .confidence = 0.99, .R = 10000, .reduction = 0) {

replicates <- bootstrap_replicates_vector (.input, .statistic, .R)
replicates <- bootstrap_replicates_vector (.input, .statistic, .R, .reduction)
interval <- stats::quantile (replicates, probs = c ((1 - .confidence)/2, (1 + .confidence)/2))
middle <- .statistic (.input)

@@ -71,18 +81,68 @@ compute_vector_flat_percentile_ci <- function (.input, .statistic = mean, .confi
#' @param .statistic_inner Statistic to apply at the inner level.
#' @param .confidence Confidence level.
#' @param .R Number of replicates.
#' @param .reduction Replicate size reduction.
#' @return Tibble with confidence interval.
#' @export
compute_list_vectors_hierarchical_percentile_ci <- function (.input, .statistic_outer = mean, .statistic_inner = mean, .confidence = 0.99, .R = 10000) {
compute_list_vectors_hierarchical_percentile_ci <- function (.input, .statistic_outer = mean, .statistic_inner = mean, .confidence = 0.99, .R = 10000, .reduction = 0) {

replicates <- bootstrap_replicates_list_vectors (.input, .statistic_outer, .statistic_inner, .R)
replicates <- bootstrap_replicates_list_vectors (.input, .statistic_outer, .statistic_inner, .R, .reduction)
interval <- stats::quantile (replicates, probs = c ((1 - .confidence)/2, (1 + .confidence)/2))
middle <- .statistic_outer (vapply (.input, .statistic_inner, numeric (1)))

tibble (lo = interval [1], hi = interval [2], mid = middle)
}
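A usage sketch, not part of the diff, showing both interval helpers on synthetic data: a flat interval on a plain vector and a hierarchical interval on a list of per run vectors.

flat_ci <- compute_vector_flat_percentile_ci (rnorm (100, mean = 10), .R = 1000)
hierarchical_ci <- compute_list_vectors_hierarchical_percentile_ci (
    list (rnorm (100, mean = 10), rnorm (100, mean = 10.2)), .R = 1000)
flat_ci           # tibble with lo, hi, mid columns
hierarchical_ci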


#' Compute flat percentile bootstrap confidence interval on data set pair.
#'
#' @param .input_one Data set one to compute with.
#' @param .input_two Data set two to compute with.
#' @param .combination Function used to combine the statistics of the two data sets.
#' @param .statistic Statistic to compute for each data set.
#' @param .confidence Confidence level.
#' @param .R Number of replicates.
#' @param .reduction Replicate size reduction.
#' @return Tibble with confidence interval.
#' @export
compute_vector_pair_flat_percentile_ci <- function (.input_one, .input_two, .combination = .Primitive ('-'), .statistic = mean, .confidence = 0.99, .R = 10000, .reduction = 0) {

replicates_one <- bootstrap_replicates_vector (.input_one, .statistic, .R, .reduction)
replicates_two <- bootstrap_replicates_vector (.input_two, .statistic, .R, .reduction)
replicates <- .combination (replicates_one, replicates_two)
interval <- stats::quantile (replicates, probs = c ((1 - .confidence)/2, (1 + .confidence)/2))
middle <- .combination (.statistic (.input_one), .statistic (.input_two))

tibble (lo = interval [1], hi = interval [2], mid = middle)
}
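A usage sketch, not part of the diff: confidence interval for the difference of means between two samples, using the default '-' combination.

times_new <- rnorm (200, mean = 95, sd = 5)
times_old <- rnorm (200, mean = 100, sd = 5)
compute_vector_pair_flat_percentile_ci (times_new, times_old, .R = 1000)
# An interval entirely below zero indicates the first sample has the lower mean.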


#' Compute hierarchical percentile bootstrap confidence interval on data set pair.
#'
#' @param .input_one Data set one to compute with.
#' @param .input_two Data set two to compute with.
#' @param .combination Function used to combine the statistics of the two data sets.
#' @param .statistic_outer Statistic to compute at the outer level.
#' @param .statistic_inner Statistic to compute at the inner level.
#' @param .confidence Confidence level.
#' @param .R Number of replicates.
#' @param .reduction Replicate size reduction.
#' @return Tibble with confidence interval.
#' @export
compute_list_vectors_pair_hierarchical_percentile_ci <- function (.input_one, .input_two, .combination = .Primitive ('-'), .statistic_outer = mean, .statistic_inner = mean, .confidence = 0.99, .R = 10000, .reduction = 0) {

replicates_one <- bootstrap_replicates_list_vectors (.input_one, .statistic_outer, .statistic_inner, .R, .reduction)
replicates_two <- bootstrap_replicates_list_vectors (.input_two, .statistic_outer, .statistic_inner, .R, .reduction)
replicates <- .combination (replicates_one, replicates_two)
interval <- stats::quantile (replicates, probs = c ((1 - .confidence)/2, (1 + .confidence)/2))
middle <- .combination (
.statistic_outer (vapply (.input_one, .statistic_inner, numeric (1))),
.statistic_outer (vapply (.input_two, .statistic_inner, numeric (1))))

tibble (lo = interval [1], hi = interval [2], mid = middle)
}


#' Compute flat percentile bootstrap confidence interval.
#'
#' Uses [compute_vector_flat_percentile_ci()] to compute percentile confidence interval on data that is not structured into runs.
@@ -96,7 +156,7 @@ compute_flat_percentile_ci <- function (.input, .column, ...) {
assert_renaissance (.input, .check_index = FALSE, .check_total = FALSE, .check_metadata = FALSE)

.input |>
group_by (.data $ vm, .data $ benchmark) |>
group_by (.data $ vm, across (starts_with ('vm_')), .data $ benchmark) |>
reframe (compute_vector_flat_percentile_ci ({{ .column }}, ...))
}
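A usage sketch, not part of the diff, assuming the artificial tibble produced by rren_artificial satisfies assert_renaissance with the relaxed checks used here.

data <- rren_artificial (rnorm (50, mean = 10), run_count = 3L, benchmark_count = 2L, vm_count = 2L)
compute_flat_percentile_ci (data, time, .R = 1000)
# One row per virtual machine and benchmark, with lo, hi, mid columns appended.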

@@ -107,13 +167,81 @@ compute_flat_percentile_ci <- function (.input, .column, ...) {
#'
#' @param .input Data structured into runs.
#' @param .column Column to compute confidence interval for.
#' @param ... Parameters to [compute_vector_flat_percentile_ci()].
#' @param ... Parameters to [compute_list_vectors_hierarchical_percentile_ci()].
#' @return Summarized tibble with confidence interval columns.
#' @export
compute_hierarchical_percentile_ci <- function (.input, .column, ...) {
assert_renaissance (.input, .check_index = FALSE, .check_total = FALSE, .check_metadata = FALSE)

.input |>
group_by (.data $ vm, .data $ benchmark) |>
group_by (.data $ vm, across (starts_with ('vm_')), .data $ benchmark) |>
reframe (compute_list_vectors_hierarchical_percentile_ci (split ({{ .column }}, .data $ run, drop = TRUE), ...))
}


#' Compute flat relative percentile bootstrap confidence interval.
#'
#' Uses [compute_vector_pair_flat_percentile_ci()] to compute percentile
#' confidence interval on data that is not structured into runs.
#'
#' The statistic is computed relative to the baseline virtual machine.
#'
#' @param .input Data not structured into runs.
#' @param .baseline Baseline virtual machine.
#' @param .column Column to compute confidence interval for.
#' @param ... Parameters to [compute_vector_pair_flat_percentile_ci()].
#' @return Summarized tibble with confidence interval columns.
#' @export
compute_flat_relative_percentile_ci <- function (.input, .baseline, .column, ...) {
assert_renaissance (.input, .check_index = FALSE, .check_total = FALSE, .check_metadata = FALSE)

# Subset baseline to reduce the amount of filtering per benchmark.
baseline_for_vm <- .input |> filter (.data $ vm == .baseline)

baseline_for_benchmark <- function (benchmark) {
force (benchmark)
baseline <- baseline_for_vm |> filter (.data $ benchmark == .env $ benchmark)
return (baseline |> pull ({{ .column }}))
}

.input |>
group_by (.data $ vm, across (starts_with ('vm_')), .data $ benchmark) |>
reframe (compute_vector_pair_flat_percentile_ci (
{{ .column }},
baseline_for_benchmark (cur_group () $ benchmark),
...))
}
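A usage sketch, not part of the diff, reusing the artificial data layout from rren_artificial: the baseline is identified by its vm hash, here looked up through vm_name.

data <- rren_artificial (rnorm (50, mean = 10), run_count = 3L, benchmark_count = 2L, vm_count = 2L)
baseline <- unique (data |> dplyr::filter (vm_name == 'Name 1') |> dplyr::pull (vm))
compute_flat_relative_percentile_ci (data, baseline, time, .R = 1000)
# With the default '-' combination, rows for the baseline itself center near zero.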


#' Compute hierarchical relative percentile bootstrap confidence interval.
#'
#' Uses [compute_list_vectors_pair_hierarchical_percentile_ci()] to compute percentile
#' confidence interval on data that is structured into runs.
#'
#' The statistic is computed relative to the baseline virtual machine.
#'
#' @param .input Data structured into runs.
#' @param .baseline Baseline virtual machine.
#' @param .column Column to compute confidence interval for.
#' @param ... Parameters to [compute_list_vectors_pair_hierarchical_percentile_ci()].
#' @return Summarized tibble with confidence interval columns.
#' @export
compute_hierarchical_relative_percentile_ci <- function (.input, .baseline, .column, ...) {
assert_renaissance (.input, .check_index = FALSE, .check_total = FALSE, .check_metadata = FALSE)

# Subset baseline to reduce the amount of filtering per benchmark.
baseline_for_vm <- .input |> filter (.data $ vm == .baseline)

baseline_for_benchmark <- function (benchmark) {
force (benchmark)
baseline <- baseline_for_vm |> filter (.data $ benchmark == .env $ benchmark)
return (split (baseline |> pull ({{ .column }}), baseline $ run, drop = TRUE))
}

.input |>
group_by (.data $ vm, across (starts_with ('vm_')), .data $ benchmark) |>
reframe (compute_list_vectors_pair_hierarchical_percentile_ci (
split ({{ .column }}, .data $ run, drop = TRUE),
baseline_for_benchmark (cur_group () $ benchmark),
...))
}