From 01d2a608238615d8be2a14b1dfaf6ef5d591ede2 Mon Sep 17 00:00:00 2001 From: Hy3n4 Date: Fri, 27 Oct 2023 14:40:40 +0200 Subject: [PATCH 1/3] feat(slo): use record rules in PrometheusRules basically this adds the functionality to use record rules over the pure metrics - add some more TODOs and fixes from previous merge Signed-off-by: Hy3n4 --- internal/controller/openslo/slo_controller.go | 128 ++++++++++++++---- internal/utils/common_utils.go | 22 +++ 2 files changed, 125 insertions(+), 25 deletions(-) diff --git a/internal/controller/openslo/slo_controller.go b/internal/controller/openslo/slo_controller.go index 28413c8..e5b49d8 100644 --- a/internal/controller/openslo/slo_controller.go +++ b/internal/controller/openslo/slo_controller.go @@ -109,6 +109,22 @@ func (r *SLOReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R sli.Spec.RatioMetric = slo.Spec.Indicator.Spec.RatioMetric } log.Info("SLI created", "SLI Name", sli.Name, "SLI Namespace", sli.Namespace, "SLI RatioMetric", sli.Spec.RatioMetric) + } else { + err = utils.UpdateStatus( + ctx, + slo, + r.Client, + "Ready", + metav1.ConditionFalse, + "SLIObjectNotFound", + "SLI Object not found", + ) + if err != nil { + log.Error(err, "Failed to update SLO status") + return ctrl.Result{}, err + } + log.Error(err, "SLO has no SLI reference") + return ctrl.Result{}, err } // Check if this PrometheusRule already exists @@ -140,21 +156,7 @@ func (r *SLOReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R } } - for _, rule := range promRule.Spec.Groups[0].Rules { - if rule.Expr != intstr.Parse(fmt.Sprintf("sum(rate(%s[%s])) / sum(rate(%s[%s]))", - sli.Spec.RatioMetric.Good.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - sli.Spec.RatioMetric.Total.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - )) { - promRule.Spec.Groups[0].Rules[0].Expr = intstr.Parse(fmt.Sprintf("sum(rate(%s[%s])) / sum(rate(%s[%s]))", - sli.Spec.RatioMetric.Good.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - sli.Spec.RatioMetric.Total.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - )) - } - } + //TODO: Update the PrometheusRule object and write the result back if there are any changes, possibly using reflect.DeepEqual and reflect.Copy if err := r.Update(ctx, promRule); err != nil { if apierrors.IsNotFound(err) { @@ -199,6 +201,89 @@ func (r *SLOReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R } func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1.SLI) (*monitoringv1.PrometheusRule, error) { + var monitoringRules []monitoringv1.Rule + var totalRule monitoringv1.Rule + var goodRule monitoringv1.Rule + var badRule monitoringv1.Rule + defaultRateWindow := "1m" + + // for now, total and good are required. bad is optional and is calculated as (total - good) if not provided + // TODO: validate that the SLO budgeting method is Occurrences and that the SLIs are all ratio metrics in other case throw an error + totalRule.Record = fmt.Sprintf("osko:ratio_indicator_total:rate%s", defaultRateWindow) + totalRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", + sli.Spec.RatioMetric.Total.MetricSource.Spec, + defaultRateWindow, + )) + totalRule.Labels = utils.MergeLabels( + map[string]string{ + "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Total.MetricSource.Spec), + }, + slo.Labels, + ) + + monitoringRules = append(monitoringRules, totalRule) + + goodRule.Record = fmt.Sprintf("osko:ratio_indicator_good:rate%s", defaultRateWindow) + goodRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", + sli.Spec.RatioMetric.Good.MetricSource.Spec, + defaultRateWindow, + )) + goodRule.Labels = utils.MergeLabels( + map[string]string{ + "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Good.MetricSource.Spec), + }, + slo.Labels, + ) + + monitoringRules = append(monitoringRules, goodRule) + + basicRuleQuery := fmt.Sprintf("(1-%s) * %s[%s:%s] - (%s[%s:%s] - %s[%s:%s])", + slo.Spec.Objectives[0].Target, + totalRule.Record, + slo.Spec.TimeWindow[0].Duration, + defaultRateWindow, + totalRule.Record, + slo.Spec.TimeWindow[0].Duration, + defaultRateWindow, + goodRule.Record, + slo.Spec.TimeWindow[0].Duration, + defaultRateWindow, + ) + + if sli.Spec.RatioMetric.Bad != (openslov1.MetricSpec{}) { + badRule.Record = fmt.Sprintf("osko:ratio_indicator_bad:rate%s", defaultRateWindow) + badRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", + sli.Spec.RatioMetric.Bad.MetricSource.Spec, + defaultRateWindow, + )) + badRule.Labels = utils.MergeLabels( + map[string]string{ + "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Bad.MetricSource.Spec), + }, + slo.Labels, + ) + basicRuleQuery = fmt.Sprintf("(1-%s) * %s[%s:%s] - %s[%s:%s])", + slo.Spec.Objectives[0].Target, + totalRule.Record, + slo.Spec.TimeWindow[0].Duration, + defaultRateWindow, + badRule.Expr.StrVal, + slo.Spec.TimeWindow[0].Duration, + defaultRateWindow, + ) + monitoringRules = append(monitoringRules, badRule) + } + + mRule := monitoringv1.Rule{ + Record: fmt.Sprintf("osko:error_budget:rate%s", slo.Spec.TimeWindow[0].Duration), + Expr: intstr.Parse(fmt.Sprint(basicRuleQuery)), + Labels: utils.MergeLabels( + slo.Labels, + ), + } + + monitoringRules = append(monitoringRules, mRule) + rule := &monitoringv1.PrometheusRule{ TypeMeta: metav1.TypeMeta{ APIVersion: "monitoring.coreos.com/v1", @@ -212,18 +297,11 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. }, Spec: monitoringv1.PrometheusRuleSpec{ Groups: []monitoringv1.RuleGroup{{ - Name: slo.Name, - Rules: []monitoringv1.Rule{{ - Expr: intstr.Parse(fmt.Sprintf("sum(rate(%s[%s])) / sum(rate(%s[%s]))", - sli.Spec.RatioMetric.Good.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - sli.Spec.RatioMetric.Total.MetricSource.Spec, - slo.Spec.TimeWindow[0].Duration, - )), - }}}}, + Name: slo.Name, + Rules: monitoringRules, + }}, }, } - // Set SLO instance as the owner and controller. err := ctrl.SetControllerReference(slo, rule, r.Scheme) if err != nil { diff --git a/internal/utils/common_utils.go b/internal/utils/common_utils.go index 91cb730..b79cf45 100644 --- a/internal/utils/common_utils.go +++ b/internal/utils/common_utils.go @@ -5,6 +5,7 @@ import ( openslov1 "github.com/oskoperator/osko/apis/openslo/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "strings" "time" ) @@ -57,3 +58,24 @@ func UpdateStatus(ctx context.Context, slo *openslov1.SLO, r client.Client, cond slo.Status.Ready = reason return r.Status().Update(ctx, slo) } + +func ExtractMetricNameFromQuery(query string) string { + index := strings.Index(query, "{") + if index == -1 { + return "" + } + + subStr := query[:index] + return subStr +} + +func MergeLabels(ms ...map[string]string) map[string]string { + res := map[string]string{} + for _, m := range ms { + for k, v := range m { + res[k] = v + } + } + + return res +} From e8e5b4958371ca83e996e33fdefc22c2ec5fb27f Mon Sep 17 00:00:00 2001 From: Hy3n4 Date: Mon, 30 Oct 2023 13:28:25 +0100 Subject: [PATCH 2/3] feat(slo): labels and rule naming based on ADR Signed-off-by: Hy3n4 --- internal/controller/openslo/slo_controller.go | 35 ++++++------------- internal/utils/common_utils.go | 9 +++++ 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/internal/controller/openslo/slo_controller.go b/internal/controller/openslo/slo_controller.go index e5b49d8..78b1af8 100644 --- a/internal/controller/openslo/slo_controller.go +++ b/internal/controller/openslo/slo_controller.go @@ -209,31 +209,21 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. // for now, total and good are required. bad is optional and is calculated as (total - good) if not provided // TODO: validate that the SLO budgeting method is Occurrences and that the SLIs are all ratio metrics in other case throw an error - totalRule.Record = fmt.Sprintf("osko:ratio_indicator_total:rate%s", defaultRateWindow) + totalRule.Record = fmt.Sprintf("osko_sli_ratio_total") totalRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", sli.Spec.RatioMetric.Total.MetricSource.Spec, defaultRateWindow, )) - totalRule.Labels = utils.MergeLabels( - map[string]string{ - "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Total.MetricSource.Spec), - }, - slo.Labels, - ) + totalRule.Labels = utils.GenerateMetricLabels(slo, sli) monitoringRules = append(monitoringRules, totalRule) - goodRule.Record = fmt.Sprintf("osko:ratio_indicator_good:rate%s", defaultRateWindow) + goodRule.Record = fmt.Sprintf("osko_sli_ratio_good") goodRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", sli.Spec.RatioMetric.Good.MetricSource.Spec, defaultRateWindow, )) - goodRule.Labels = utils.MergeLabels( - map[string]string{ - "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Good.MetricSource.Spec), - }, - slo.Labels, - ) + goodRule.Labels = utils.GenerateMetricLabels(slo, sli) monitoringRules = append(monitoringRules, goodRule) @@ -251,17 +241,12 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. ) if sli.Spec.RatioMetric.Bad != (openslov1.MetricSpec{}) { - badRule.Record = fmt.Sprintf("osko:ratio_indicator_bad:rate%s", defaultRateWindow) + badRule.Record = fmt.Sprint("osko_sli_ratio_bad") badRule.Expr = intstr.Parse(fmt.Sprintf("sum(increase(%s[%s]))", sli.Spec.RatioMetric.Bad.MetricSource.Spec, defaultRateWindow, )) - badRule.Labels = utils.MergeLabels( - map[string]string{ - "metric": utils.ExtractMetricNameFromQuery(sli.Spec.RatioMetric.Bad.MetricSource.Spec), - }, - slo.Labels, - ) + badRule.Labels = utils.GenerateMetricLabels(slo, sli) basicRuleQuery = fmt.Sprintf("(1-%s) * %s[%s:%s] - %s[%s:%s])", slo.Spec.Objectives[0].Target, totalRule.Record, @@ -275,15 +260,15 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. } mRule := monitoringv1.Rule{ - Record: fmt.Sprintf("osko:error_budget:rate%s", slo.Spec.TimeWindow[0].Duration), + Record: fmt.Sprint("osko_error_budget_available"), Expr: intstr.Parse(fmt.Sprint(basicRuleQuery)), - Labels: utils.MergeLabels( - slo.Labels, - ), + Labels: utils.GenerateMetricLabels(slo, sli), } monitoringRules = append(monitoringRules, mRule) + // Calculate Error ratios for 1h, 6h + rule := &monitoringv1.PrometheusRule{ TypeMeta: metav1.TypeMeta{ APIVersion: "monitoring.coreos.com/v1", diff --git a/internal/utils/common_utils.go b/internal/utils/common_utils.go index b79cf45..6c4b96c 100644 --- a/internal/utils/common_utils.go +++ b/internal/utils/common_utils.go @@ -69,6 +69,15 @@ func ExtractMetricNameFromQuery(query string) string { return subStr } +func GenerateMetricLabels(slo *openslov1.SLO, sli *openslov1.SLI) map[string]string { + return map[string]string{ + "sli_name": sli.Name, + "slo_name": slo.Name, + "service": slo.Spec.Service, + "window": string(slo.Spec.TimeWindow[0].Duration), + } +} + func MergeLabels(ms ...map[string]string) map[string]string { res := map[string]string{} for _, m := range ms { From e694079df77db8a860ce1ecf4e7156e3eb4da1c8 Mon Sep 17 00:00:00 2001 From: Hy3n4 Date: Tue, 31 Oct 2023 08:08:22 +0100 Subject: [PATCH 3/3] feat(slo): ability to use metric labels Now we are able to use labels in the recording rules Signed-off-by: Hy3n4 --- internal/controller/openslo/slo_controller.go | 45 ++++++++++++++----- internal/utils/common_utils.go | 41 ++++++++++++++--- 2 files changed, 69 insertions(+), 17 deletions(-) diff --git a/internal/controller/openslo/slo_controller.go b/internal/controller/openslo/slo_controller.go index 78b1af8..ec2efee 100644 --- a/internal/controller/openslo/slo_controller.go +++ b/internal/controller/openslo/slo_controller.go @@ -205,7 +205,11 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. var totalRule monitoringv1.Rule var goodRule monitoringv1.Rule var badRule monitoringv1.Rule + var ratioRule monitoringv1.Rule defaultRateWindow := "1m" + burnRateTimeWindows := []string{"1h", "6h", "3d"} + l := utils.LabelGeneratorParams{Slo: slo, Sli: sli} + m := utils.MetricLabelParams{Slo: slo, Sli: sli} // for now, total and good are required. bad is optional and is calculated as (total - good) if not provided // TODO: validate that the SLO budgeting method is Occurrences and that the SLIs are all ratio metrics in other case throw an error @@ -214,7 +218,7 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. sli.Spec.RatioMetric.Total.MetricSource.Spec, defaultRateWindow, )) - totalRule.Labels = utils.GenerateMetricLabels(slo, sli) + totalRule.Labels = l.NewMetricLabelGenerator() monitoringRules = append(monitoringRules, totalRule) @@ -223,21 +227,21 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. sli.Spec.RatioMetric.Good.MetricSource.Spec, defaultRateWindow, )) - goodRule.Labels = utils.GenerateMetricLabels(slo, sli) + goodRule.Labels = l.NewMetricLabelGenerator() monitoringRules = append(monitoringRules, goodRule) - basicRuleQuery := fmt.Sprintf("(1-%s) * %s[%s:%s] - (%s[%s:%s] - %s[%s:%s])", + basicRuleQuery := fmt.Sprintf("(1-%s) * sum(increase(%s{%s}[%s])) - (sum(increase(%s{%s}[%s])) - sum(increase(%s{%s}[%s])))", slo.Spec.Objectives[0].Target, totalRule.Record, + m.NewMetricLabelCompiler(), slo.Spec.TimeWindow[0].Duration, - defaultRateWindow, totalRule.Record, + m.NewMetricLabelCompiler(), slo.Spec.TimeWindow[0].Duration, - defaultRateWindow, goodRule.Record, + m.NewMetricLabelCompiler(), slo.Spec.TimeWindow[0].Duration, - defaultRateWindow, ) if sli.Spec.RatioMetric.Bad != (openslov1.MetricSpec{}) { @@ -246,15 +250,15 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. sli.Spec.RatioMetric.Bad.MetricSource.Spec, defaultRateWindow, )) - badRule.Labels = utils.GenerateMetricLabels(slo, sli) - basicRuleQuery = fmt.Sprintf("(1-%s) * %s[%s:%s] - %s[%s:%s])", + badRule.Labels = l.NewMetricLabelGenerator() + basicRuleQuery = fmt.Sprintf("(1-%s) * sum(increase(%s{%s}[%s])) - sum(increase(%s{%s}[%s])))", slo.Spec.Objectives[0].Target, totalRule.Record, + m.NewMetricLabelCompiler(), slo.Spec.TimeWindow[0].Duration, - defaultRateWindow, badRule.Expr.StrVal, + m.NewMetricLabelCompiler(), slo.Spec.TimeWindow[0].Duration, - defaultRateWindow, ) monitoringRules = append(monitoringRules, badRule) } @@ -262,12 +266,29 @@ func (r *SLOReconciler) createPrometheusRule(slo *openslov1.SLO, sli *openslov1. mRule := monitoringv1.Rule{ Record: fmt.Sprint("osko_error_budget_available"), Expr: intstr.Parse(fmt.Sprint(basicRuleQuery)), - Labels: utils.GenerateMetricLabels(slo, sli), + Labels: l.NewMetricLabelGenerator(), } monitoringRules = append(monitoringRules, mRule) - // Calculate Error ratios for 1h, 6h + // Calculate Error ratios for 1h, 6h, 3d + for _, timeWindow := range burnRateTimeWindows { + l.TimeWindow = timeWindow + ratioRule.Record = fmt.Sprintf("osko_sli_ratio") + ratioRule.Expr = intstr.Parse(fmt.Sprintf("(sum(increase(%s{%s}[%s]))-sum(increase(%s{%s}[%s])))/sum(increase(%s{%s}[%s]))", + totalRule.Record, + m.NewMetricLabelCompiler(), + timeWindow, + goodRule.Record, + m.NewMetricLabelCompiler(), + timeWindow, + totalRule.Record, + m.NewMetricLabelCompiler(), + timeWindow, + )) + ratioRule.Labels = l.NewMetricLabelGenerator() + monitoringRules = append(monitoringRules, ratioRule) + } rule := &monitoringv1.PrometheusRule{ TypeMeta: metav1.TypeMeta{ diff --git a/internal/utils/common_utils.go b/internal/utils/common_utils.go index 6c4b96c..9683526 100644 --- a/internal/utils/common_utils.go +++ b/internal/utils/common_utils.go @@ -9,6 +9,19 @@ import ( "time" ) +type LabelGeneratorParams struct { + Slo *openslov1.SLO + Sli *openslov1.SLI + TimeWindow string +} + +type MetricLabelParams struct { + Slo *openslov1.SLO + Sli *openslov1.SLI + TimeWindow string + Labels map[string]string +} + // UpdateCondition checks if the condition of the given type is already in the slice // if the condition already exists and has the same status, return the unmodified conditions // if the condition exists and has a different status, remove it and add the new one @@ -69,12 +82,30 @@ func ExtractMetricNameFromQuery(query string) string { return subStr } -func GenerateMetricLabels(slo *openslov1.SLO, sli *openslov1.SLI) map[string]string { +func (m MetricLabelParams) NewMetricLabelCompiler() string { + window := string(m.Slo.Spec.TimeWindow[0].Duration) + if m.TimeWindow != "" { + window = m.TimeWindow + } + + labelString := `sli_name="` + m.Sli.Name + `", slo_name="` + m.Slo.Name + `", service="` + m.Slo.Spec.Service + `", window="` + window + `"` + for k, v := range m.Labels { + labelString += `, ` + k + `="` + v + `"` + } + + return labelString +} + +func (l LabelGeneratorParams) NewMetricLabelGenerator() map[string]string { + window := string(l.Slo.Spec.TimeWindow[0].Duration) + if l.TimeWindow != "" { + window = l.TimeWindow + } return map[string]string{ - "sli_name": sli.Name, - "slo_name": slo.Name, - "service": slo.Spec.Service, - "window": string(slo.Spec.TimeWindow[0].Duration), + "sli_name": l.Sli.Name, + "slo_name": l.Slo.Name, + "service": l.Slo.Spec.Service, + "window": window, } }