From 187c10aaf6b8a3a718ba26e49e36f366d6b9bf13 Mon Sep 17 00:00:00 2001 From: Nikhil-Ladha Date: Tue, 26 Nov 2024 17:11:32 +0530 Subject: [PATCH] metrics: add basic common metric utilities add common metric utilities to be used by the controller to export metrics related to external-snapshot-metadata Signed-off-by: Nikhil-Ladha --- go.mod | 8 +- pkg/metrics/metrics.go | 265 +++++++++++++ pkg/metrics/metrics_test.go | 740 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1009 insertions(+), 4 deletions(-) create mode 100644 pkg/metrics/metrics.go create mode 100644 pkg/metrics/metrics_test.go diff --git a/go.mod b/go.mod index 58052b3e..a9b3a7cd 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,9 @@ require ( github.com/kubernetes-csi/external-snapshot-metadata/client v0.0.0-20240708191355-ca55d80f214a github.com/kubernetes-csi/external-snapshotter/client/v8 v8.0.0 github.com/kubernetes-csi/external-snapshotter/v8 v8.1.0 + github.com/prometheus/client_golang v1.20.5 + github.com/prometheus/client_model v0.6.1 + github.com/prometheus/common v0.60.1 github.com/stretchr/testify v1.9.0 google.golang.org/grpc v1.67.1 google.golang.org/protobuf v1.35.1 @@ -17,6 +20,7 @@ require ( k8s.io/apimachinery v0.31.1 k8s.io/apiserver v0.31.1 k8s.io/client-go v0.31.1 + k8s.io/component-base v0.31.1 k8s.io/klog/v2 v2.130.1 ) @@ -48,9 +52,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.20.5 // indirect - github.com/prometheus/client_model v0.6.1 // indirect - github.com/prometheus/common v0.60.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect @@ -69,7 +70,6 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/component-base v0.31.1 // indirect k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 00000000..8854c78a --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,265 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "net/http" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/apimachinery/pkg/types" + k8smetrics "k8s.io/component-base/metrics" +) + +const ( + opStatusUnknown = "Unknown" + labelDriverName = "driver_name" + labelOperationName = "operation_name" + labelOperationStatus = "operation_status" + subSystem = "snapshot_metadata_controller" + operationLatencyMetricName = "operation_total_seconds" + operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation" + operationInFlightName = "operations_in_flight" + operationInFlightHelpMsg = "Total number of operations in flight" + unknownDriverName = "unknown" + + // DynamicSnapshotType represents a snapshot that is being dynamically provisioned + DynamicSnapshotType = snapshotProvisionType("dynamic") + // PreProvisionedSnapshotType represents a snapshot that is pre-provisioned + PreProvisionedSnapshotType = snapshotProvisionType("pre-provisioned") +) + +var ( + inFlightCheckInterval = 30 * time.Second +) + +// OperationStatus is the interface type for representing an operation's execution +// status, with the nil value representing an "Unknown" status of the operation. +type OperationStatus interface { + String() string +} + +var metricBuckets = []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 300, 600} + +type MetricsManager interface { + // PrepareMetricsPath prepares the metrics path the specified pattern for + // metrics managed by this MetricsManager. + // If the "pattern" is empty (i.e., ""), it will not be registered. + // An error will be returned if there is any. + PrepareMetricsPath(mux *http.ServeMux, pattern string, logger promhttp.Logger) error + + // OperationStart takes in an operation and caches its start time. + // if the operation already exists, it's an no-op. + OperationStart(key OperationKey, val OperationValue) + + // DropOperation removes an operation from cache. + // if the operation does not exist, it's an no-op. + DropOperation(op OperationKey) + + // RecordMetrics records a metric point. Note that it will be an no-op if an + // operation has NOT been marked "Started" previously via invoking "OperationStart". + // Invoking of RecordMetrics effectively removes the cached entry. + // op - the operation which the metric is associated with. + // status - the operation status, if not specified, i.e., status == nil, an + // "Unknown" status of the passed-in operation is assumed. + RecordMetrics(op OperationKey, status OperationStatus, driverName string) + + // GetRegistry() returns the metrics.KubeRegistry used by this metrics manager. + GetRegistry() k8smetrics.KubeRegistry +} + +// OperationKey is a structure which holds information to +// uniquely identify a snapshot related operation +type OperationKey struct { + // Name is the name of the operation + Name string + // ResourceID is the resource UID to which the operation has been executed against + ResourceID types.UID +} + +// OperationValue is a structure which holds operation metadata +type OperationValue struct { + // Driver is the driver name which executes the operation + Driver string + // SnapshotType represents the snapshot type, for example: "dynamic", "pre-provisioned" + SnapshotType string + // startTime is the time when the operation first started + startTime time.Time +} + +// NewOperationKey initializes a new OperationKey +func NewOperationKey(name string, resourceUID types.UID) OperationKey { + return OperationKey{ + Name: name, + ResourceID: resourceUID, + } +} + +// NewOperationValue initializes a new OperationValue +func NewOperationValue(driver string, snapshotType snapshotProvisionType) OperationValue { + if driver == "" { + driver = unknownDriverName + } + + return OperationValue{ + Driver: driver, + SnapshotType: string(snapshotType), + } +} + +type operationMetricsManager struct { + // cache is a concurrent-safe map which stores start timestamps for all + // ongoing operations. + // key is an Operation + // value is the timestamp of the start time of the operation + cache map[OperationKey]OperationValue + + // mutex for protecting cache from concurrent access + mu sync.Mutex + + // registry is a wrapper around Prometheus Registry + registry k8smetrics.KubeRegistry + + // opLatencyMetrics is a Histogram metrics for operation time per request + opLatencyMetrics *k8smetrics.HistogramVec + + // opInFlight is a Gauge metric for the number of operations in flight + opInFlight *k8smetrics.Gauge +} + +// NewMetricsManager creates a new MetricsManager instance +func NewMetricsManager() MetricsManager { + mgr := &operationMetricsManager{ + cache: make(map[OperationKey]OperationValue), + } + mgr.init() + return mgr +} + +// OperationStart starts a new operation +func (opMgr *operationMetricsManager) OperationStart(key OperationKey, val OperationValue) { + opMgr.mu.Lock() + defer opMgr.mu.Unlock() + + if _, exists := opMgr.cache[key]; !exists { + val.startTime = time.Now() + opMgr.cache[key] = val + } + opMgr.opInFlight.Set(float64(len(opMgr.cache))) +} + +// DropOperation drops an operation +func (opMgr *operationMetricsManager) DropOperation(op OperationKey) { + opMgr.mu.Lock() + defer opMgr.mu.Unlock() + delete(opMgr.cache, op) + opMgr.opInFlight.Set(float64(len(opMgr.cache))) +} + +// RecordMetrics emits operation metrics +func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) { + opMgr.mu.Lock() + defer opMgr.mu.Unlock() + opVal, exists := opMgr.cache[opKey] + if !exists { + // the operation has not been cached, return directly + return + } + + strStatus := opStatusUnknown + if opStatus != nil { + strStatus = opStatus.String() + } + + // if we do not know the driverName while recording metrics, + // refer to the cached version instead. + if driverName == "" || driverName == unknownDriverName { + driverName = opVal.Driver + } + + operationDuration := time.Since(opVal.startTime).Seconds() + opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.Driver, strStatus).Observe(operationDuration) + + delete(opMgr.cache, opKey) +} + +func (opMgr *operationMetricsManager) init() { + opMgr.registry = k8smetrics.NewKubeRegistry() + k8smetrics.RegisterProcessStartTime(opMgr.registry.Register) + opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec( + &k8smetrics.HistogramOpts{ + Subsystem: subSystem, + Name: operationLatencyMetricName, + Help: operationLatencyMetricHelpMsg, + Buckets: metricBuckets, + }, + []string{labelDriverName, labelOperationName, labelOperationStatus}, + ) + opMgr.registry.MustRegister(opMgr.opLatencyMetrics) + opMgr.opInFlight = k8smetrics.NewGauge( + &k8smetrics.GaugeOpts{ + Subsystem: subSystem, + Name: operationInFlightName, + Help: operationInFlightHelpMsg, + }, + ) + opMgr.registry.MustRegister(opMgr.opInFlight) + + // While we always maintain the number of operations in flight + // for every metrics operation start/finish, if any are leaked, + // this scheduled routine will catch any leaked operations. + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go opMgr.scheduleOpsInFlightMetric(ctx) +} + +func (opMgr *operationMetricsManager) scheduleOpsInFlightMetric(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + default: + for range time.NewTicker(inFlightCheckInterval).C { + func() { + opMgr.mu.Lock() + defer opMgr.mu.Unlock() + opMgr.opInFlight.Set(float64(len(opMgr.cache))) + }() + } + } + } +} + +func (opMgr *operationMetricsManager) PrepareMetricsPath(mux *http.ServeMux, pattern string, logger promhttp.Logger) error { + mux.Handle(pattern, k8smetrics.HandlerFor( + opMgr.registry, + k8smetrics.HandlerOpts{ + ErrorLog: logger, + ErrorHandling: k8smetrics.ContinueOnError, + })) + + return nil +} + +func (opMgr *operationMetricsManager) GetRegistry() k8smetrics.KubeRegistry { + return opMgr.registry +} + +// snapshotProvisionType represents which kind of snapshot a metric is +type snapshotProvisionType string diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go new file mode 100644 index 00000000..5ce9b0fe --- /dev/null +++ b/pkg/metrics/metrics_test.go @@ -0,0 +1,740 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "fmt" + "io" + "log" + "net" + "net/http" + "reflect" + "sort" + "strings" + "sync" + "testing" + "time" + + cmg "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "k8s.io/apimachinery/pkg/types" +) + +var ( + statusMap map[int]string = map[int]string{ + 0: "Success", + 1: "Failure", + 2: "Unknown", + } +) + +const ( + httpPattern = "/metrics" + addr = "localhost:0" + processStartTimeMetric = "process_start_time_seconds" +) + +type fakeOpStatus struct { + statusCode int +} + +func (s *fakeOpStatus) String() string { + if str, ok := statusMap[s.statusCode]; ok { + return str + } + return "Unknown" +} + +func initMgr() (MetricsManager, *http.Server) { + mgr := NewMetricsManager() + mux := http.NewServeMux() + err := mgr.PrepareMetricsPath(mux, httpPattern, nil) + if err != nil { + log.Fatalf("failed to start serving [%v]", err) + } + l, err := net.Listen("tcp", addr) + if err != nil { + log.Fatalf("failed to listen on address[%s], error[%v]", addr, err) + } + srv := &http.Server{Addr: l.Addr().String(), Handler: mux} + go func() { + if err := srv.Serve(l); err != http.ErrServerClosed { + log.Fatalf("failed to start endpoint at:%s/%s, error: %v", addr, httpPattern, err) + } + }() + + return mgr, srv +} + +func shutdown(srv *http.Server) { + if err := srv.Shutdown(context.Background()); err != nil { + panic(err) + } +} + +func TestNew(t *testing.T) { + mgr, srv := initMgr() + defer shutdown(srv) + if mgr == nil { + t.Errorf("failed testing new") + } +} + +func TestDropNonExistingOperation(t *testing.T) { + mgr, srv := initMgr() + defer shutdown(srv) + op := OperationKey{ + Name: "drop-non-existing-operation-should-be-noop", + ResourceID: types.UID("uid"), + } + mgr.DropOperation(op) +} + +func TestRecordMetricsForNonExistingOperation(t *testing.T) { + mgr, srv := initMgr() + srvAddr := "http://" + srv.Addr + httpPattern + defer shutdown(srv) + opKey := OperationKey{ + Name: "no-metrics-should-be-recorded-as-operation-did-not-start", + ResourceID: types.UID("uid"), + } + mgr.RecordMetrics(opKey, nil, "driver") + rsp, err := http.Get(srvAddr) + if err != nil || rsp.StatusCode != http.StatusOK { + t.Errorf("failed to get response from server %v, %v", err, rsp) + } + r, err := io.ReadAll(rsp.Body) + if err != nil { + t.Errorf("failed to read response body %v", err) + } + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) + } +} + +func TestDropOperation(t *testing.T) { + mgr, srv := initMgr() + srvAddr := "http://" + srv.Addr + httpPattern + defer shutdown(srv) + opKey := OperationKey{ + Name: "should-have-been-dropped", + ResourceID: types.UID("uid"), + } + opVal := NewOperationValue("driver", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) + mgr.DropOperation(opKey) + time.Sleep(300 * time.Millisecond) + rsp, err := http.Get(srvAddr) + if err != nil || rsp.StatusCode != http.StatusOK { + t.Errorf("failed to get response from server %v, %v", err, rsp) + } + r, err := io.ReadAll(rsp.Body) + if err != nil { + t.Errorf("failed to read response body %v", err) + } + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) + } + // re-add with a different name + opKey.Name = "should-have-been-added" + mgr.OperationStart(opKey, opVal) + time.Sleep(300 * time.Millisecond) + opStatus := &fakeOpStatus{ + statusCode: 0, + } + mgr.RecordMetrics(opKey, opStatus, "driver") + expected := + `# HELP snapshot_metadata_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end +# TYPE snapshot_metadata_controller_operation_total_seconds histogram +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1 +` + + if err := verifyMetric(expected, srvAddr); err != nil { + t.Errorf("failed testing[%v]", err) + } +} + +func TestUnknownStatus(t *testing.T) { + mgr, srv := initMgr() + srvAddr := "http://" + srv.Addr + httpPattern + defer shutdown(srv) + opKey := OperationKey{ + Name: "unknown-status-operation", + ResourceID: types.UID("uid"), + } + mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType)) + // should create a Unknown data point with latency ~300ms + time.Sleep(300 * time.Millisecond) + mgr.RecordMetrics(opKey, nil, "driver") + expected := + `# HELP snapshot_metadata_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end +# TYPE snapshot_metadata_controller_operation_total_seconds histogram +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1 +` + + if err := verifyMetric(expected, srvAddr); err != nil { + t.Errorf("failed testing[%v]", err) + } +} + +func TestRecordMetrics(t *testing.T) { + mgr, srv := initMgr() + srvAddr := "http://" + srv.Addr + httpPattern + defer shutdown(srv) + // add an operation + opKey := OperationKey{ + Name: "op1", + ResourceID: types.UID("uid1"), + } + opVal := NewOperationValue("driver1", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) + // should create a Success data point with latency ~ 1100ms + time.Sleep(1100 * time.Millisecond) + success := &fakeOpStatus{ + statusCode: 0, + } + mgr.RecordMetrics(opKey, success, "driver1") + + // add another operation metric + opKey.Name = "op2" + opKey.ResourceID = types.UID("uid2") + mgr.OperationStart(opKey, opVal) + // should create a Failure data point with latency ~ 100ms + time.Sleep(100 * time.Millisecond) + failure := &fakeOpStatus{ + statusCode: 1, + } + mgr.RecordMetrics(opKey, failure, "driver2") + + expected := + `# HELP snapshot_metadata_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end +# TYPE snapshot_metadata_controller_operation_total_seconds histogram +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1 +` + if err := verifyMetric(expected, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } +} + +func TestConcurrency(t *testing.T) { + mgr, srv := initMgr() + srvAddr := "http://" + srv.Addr + httpPattern + defer shutdown(srv) + success := &fakeOpStatus{ + statusCode: 0, + } + failure := &fakeOpStatus{ + statusCode: 1, + } + ops := []struct { + op OperationKey + desiredLatencyMillisec time.Duration + status OperationStatus + drop bool + }{ + { + OperationKey{ + Name: "success1", + ResourceID: types.UID("uid1"), + }, + 100, + success, + false, + }, + { + OperationKey{ + Name: "success2", + ResourceID: types.UID("uid2"), + }, + 100, + success, + false, + }, + { + OperationKey{ + Name: "failure1", + ResourceID: types.UID("uid3"), + }, + 100, + failure, + false, + }, + { + OperationKey{ + Name: "failure2", + ResourceID: types.UID("uid4"), + }, + 100, + failure, + false, + }, + { + OperationKey{ + Name: "unknown", + ResourceID: types.UID("uid5"), + }, + 100, + nil, + false, + }, + { + OperationKey{ + Name: "drop-from-cache", + ResourceID: types.UID("uid6"), + }, + 100, + nil, + true, + }, + } + + for i := range ops { + mgr.OperationStart(ops[i].op, OperationValue{ + Driver: fmt.Sprintf("driver%v", i), + SnapshotType: string(DynamicSnapshotType), + }) + } + // add an extra operation which should remain in the cache + remaining := OperationKey{ + Name: "remaining-in-cache", + ResourceID: types.UID("uid7"), + } + mgr.OperationStart(remaining, OperationValue{ + Driver: "driver7", + SnapshotType: string(DynamicSnapshotType), + }) + + var wgMetrics sync.WaitGroup + + for i := range ops { + wgMetrics.Add(1) + go func(i int) { + defer wgMetrics.Done() + if ops[i].desiredLatencyMillisec > 0 { + time.Sleep(ops[i].desiredLatencyMillisec * time.Millisecond) + } + if ops[i].drop { + mgr.DropOperation(ops[i].op) + } else { + mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i)) + } + }(i) + } + wgMetrics.Wait() + + // validate + expected := + `# HELP snapshot_metadata_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end +# TYPE snapshot_metadata_controller_operation_total_seconds histogram +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_metadata_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_metadata_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1 +snapshot_metadata_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1 +` + if err := verifyMetric(expected, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } +} + +func TestInFlightMetric(t *testing.T) { + inFlightCheckInterval = time.Millisecond * 50 + + mgr, srv := initMgr() + defer shutdown(srv) + srvAddr := "http://" + srv.Addr + httpPattern + + // Start first operation, should be 1 + opKey := OperationKey{ + Name: "leaked", + ResourceID: types.UID("uid"), + } + opVal := NewOperationValue("driver", "test") + mgr.OperationStart(opKey, opVal) + time.Sleep(500 * time.Millisecond) + + if err := verifyInFlightMetric(`snapshot_metadata_controller_operations_in_flight 1`, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } + + // Start second operation, should be 2 + opKey = OperationKey{ + Name: "leaked2", + ResourceID: types.UID("uid"), + } + opVal = NewOperationValue("driver2", "test2") + mgr.OperationStart(opKey, opVal) + time.Sleep(500 * time.Millisecond) + + if err := verifyInFlightMetric(`snapshot_metadata_controller_operations_in_flight 2`, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } + + // Record, should be down to 1 + mgr.RecordMetrics(opKey, nil, "driver") + time.Sleep(500 * time.Millisecond) + + if err := verifyInFlightMetric(`snapshot_metadata_controller_operations_in_flight 1`, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } + + // Start 50 operations, should be 51 + for i := 0; i < 50; i++ { + opKey := OperationKey{ + Name: fmt.Sprintf("op%d", i), + ResourceID: types.UID("uid%d"), + } + opVal := NewOperationValue("driver1", "test") + mgr.OperationStart(opKey, opVal) + } + time.Sleep(500 * time.Millisecond) + + if err := verifyInFlightMetric(`snapshot_metadata_controller_operations_in_flight 51`, srvAddr); err != nil { + t.Errorf("failed testing [%v]", err) + } +} + +func verifyInFlightMetric(expected string, srvAddr string) error { + rsp, err := http.Get(srvAddr) + if err != nil { + return err + } + if rsp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode)) + } + r, err := io.ReadAll(rsp.Body) + if err != nil { + return err + } + + if !strings.Contains(string(r), expected) { + return fmt.Errorf("failed, not equal") + } + + return nil +} + +func verifyMetric(expected, srvAddr string) error { + rsp, err := http.Get(srvAddr) + if err != nil { + return err + } + if rsp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to get response from serve: %s", http.StatusText(rsp.StatusCode)) + } + r, err := io.ReadAll(rsp.Body) + if err != nil { + return err + } + + format := expfmt.ResponseFormat(rsp.Header) + gotReader := strings.NewReader(string(r)) + gotDecoder := expfmt.NewDecoder(gotReader, format) + expectedReader := strings.NewReader(expected) + expectedDecoder := expfmt.NewDecoder(expectedReader, format) + + gotMfs := []*cmg.MetricFamily{} + expectedMfs := []*cmg.MetricFamily{} + for { + gotMf := &cmg.MetricFamily{} + gotErr := gotDecoder.Decode(gotMf) + expectedMf := &cmg.MetricFamily{} + if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil { + // return correctly if both are EOF + if expectedErr == io.EOF && gotErr == io.EOF { + break + } else { + return err + } + } + gotMfs = append(gotMfs, gotMf) + expectedMfs = append(expectedMfs, expectedMf) + } + + if !containsMetrics(expectedMfs, gotMfs) { + return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r)) + } + + return nil +} + +// sortMfs, sorts metric families in alphabetical order by type. +// currently only supports counter and histogram +func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily { + var sortedMfs []*cmg.MetricFamily + + // Sort first by type + sort.Slice(mfs, func(i, j int) bool { + return *mfs[i].Type < *mfs[j].Type + }) + + // Next, sort by length of name + sort.Slice(mfs, func(i, j int) bool { + return len(*mfs[i].Name) < len(*mfs[j].Name) + }) + + return sortedMfs +} + +func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool { + if len(gotMfs) != len(expectedMfs) { + fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs)) + return false + } + + // sort metric families for deterministic comparison. + sortedExpectedMfs := sortMfs(expectedMfs) + sortedGotMfs := sortMfs(gotMfs) + + // compare expected vs. sorted actual metrics + for k, got := range sortedGotMfs { + matchCount := 0 + expected := sortedExpectedMfs[k] + + if (got.Name == nil || *(got.Name) != *(expected.Name)) || + (got.Type == nil || *(got.Type) != *(expected.Type)) || + (got.Help == nil || *(got.Help) != *(expected.Help)) { + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help) + return false + } + + numRecords := len(expected.Metric) + if len(got.Metric) < numRecords { + fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords) + return false + } + for i := 0; i < len(got.Metric); i++ { + for j := 0; j < numRecords; j++ { + if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil || + got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil { + fmt.Printf("got metric and expected metric histogram type mismatch") + return false + } + + // labels should be the same + if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) { + continue + } + + // metric type specific checks + switch { + case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil: + gh := got.Metric[i].Histogram + eh := expected.Metric[j].Histogram + if gh == nil || eh == nil { + continue + } + if !reflect.DeepEqual(gh.Bucket, eh.Bucket) { + fmt.Println("got and expected histogram bucket not equal") + continue + } + + // this is a sum record, expecting a latency which is more than the + // expected one. If the sum is smaller than expected, it will be considered + // as NOT a match + if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) { + fmt.Println("difference in sample sum") + continue + } + if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) { + fmt.Println("difference in sample count") + continue + } + + case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil: + gc := got.Metric[i].Counter + ec := expected.Metric[j].Counter + if gc.Value == nil || *(gc.Value) != *(ec.Value) { + fmt.Println("difference in counter values") + continue + } + } + + // this is a match + matchCount = matchCount + 1 + break + } + } + + if matchCount != numRecords { + fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords) + return false + } + } + + return true +} + +func TestProcessStartTimeMetricExist(t *testing.T) { + mgr, srv := initMgr() + defer shutdown(srv) + metricsFamilies, err := mgr.GetRegistry().Gather() + if err != nil { + t.Fatalf("Error fetching metrics: %v", err) + } + + for _, metricsFamily := range metricsFamilies { + if metricsFamily.GetName() == processStartTimeMetric { + return + } + m := metricsFamily.GetMetric() + if m[0].GetGauge().GetValue() <= 0 { + t.Fatalf("Expected non zero timestamp for process start time") + } + } + + t.Fatalf("Metrics does not contain %v. Scraped content: %v", processStartTimeMetric, metricsFamilies) +}