diff --git a/.github/workflows/scale-test-v2.yaml b/.github/workflows/scale-test-v2.yaml index 27a64479c3..c0f38cf88b 100644 --- a/.github/workflows/scale-test-v2.yaml +++ b/.github/workflows/scale-test-v2.yaml @@ -68,6 +68,14 @@ permissions: contents: read id-token: write +env: + RESOURCE_GROUP: ${{ inputs.resource_group }} + CLUSTER_NAME: ${{ inputs.cluster_name }} + TAG: RetinaVersion + VM_SIZE: Standard_D4_v3 + LOCATION: westus2 + OUTPUT_FILEPATH: ./output.log + jobs: setup-cluster: if: ${{ github.event.inputs.create_cluster == 'true' }} @@ -135,14 +143,6 @@ jobs: shell: bash run: az aks get-credentials --name $CLUSTER_NAME --resource-group $RESOURCE_GROUP --overwrite-existing - - name: Run Scale Test - shell: bash - run: | - set -euo pipefail - # Placeholder for test - sleep 300 & - echo "TEST_PID=$!" >> $GITHUB_ENV - - name: Clone ClusterLoader2 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -163,18 +163,29 @@ jobs: mkdir test cp ../../.github/actions/cl2/* ./test -r go build ./cmd/clusterloader.go - ls -l - - name: Run CL2 + - name: Run Scale Test shell: bash + env: + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION }} + AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }} run: | set -euo pipefail - cd perf-tests/clusterloader2 - ./clusterloader --testconfig=./test/config.yaml --provider=aks --kubeconfig=$HOME/.kube/config --v=2 --report-dir=./report - - - name: Stop test - shell: bash - run: kill $TEST_PID + # Placeholder for test + go test ./test/e2e/. -v -tags=scale -timeout 300s -args -image-tag=$(make version) -create-infra=false -delete-infra=false + + # - name: Run CL2 + # shell: bash + # run: | + # set -euo pipefail + # cd perf-tests/clusterloader2 + # ./clusterloader --testconfig=./test/config.yaml --provider=aks --kubeconfig=$HOME/.kube/config --v=2 --report-dir=./report + + # - name: Stop test + # shell: bash + # run: | + # PID=$(ps aux | grep "go test" | awk '{print $2}') + # kill -s 15 $PID cleanup: name: Cleanup diff --git a/test/e2e/framework/scaletest/clusterloader2.go b/test/e2e/framework/scaletest/clusterloader2.go new file mode 100644 index 0000000000..cc051cc544 --- /dev/null +++ b/test/e2e/framework/scaletest/clusterloader2.go @@ -0,0 +1,39 @@ +package scaletest + +import ( + "fmt" + "os" + "os/exec" +) + +type ClusterLoader2 struct{} + +func (d *ClusterLoader2) Prevalidate() error { + return nil +} + +func (d *ClusterLoader2) Run() error { + args := []string{ + "--testconfig=../../perf-tests/clusterloader2/test/config.yaml", + "--provider=aks", + "--kubeconfig=/home/runner/.kube/config", + "--v=2", + "--report-dir=../../perf-tests/clusterloader2/report", + } + cl2Path := "../../perf-tests/clusterloader2/clusterloader" + cmd := exec.Command(cl2Path, args...) + + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + err := cmd.Run() + if err != nil { + return fmt.Errorf("Error executing CL2: %w", err) + } + + return nil +} + +func (d *ClusterLoader2) Stop() error { + return nil +} diff --git a/test/e2e/framework/scaletest/get-publish-metrics.go b/test/e2e/framework/scaletest/get-publish-metrics.go index 3495addf33..271ff6f6dd 100644 --- a/test/e2e/framework/scaletest/get-publish-metrics.go +++ b/test/e2e/framework/scaletest/get-publish-metrics.go @@ -44,6 +44,7 @@ func (g *GetAndPublishMetrics) Run() error { } g.stop = make(chan struct{}) + g.wg.Add(1) go func() { @@ -66,7 +67,6 @@ func (g *GetAndPublishMetrics) Run() error { } } - }() return nil diff --git a/test/e2e/framework/scaletest/options.go b/test/e2e/framework/scaletest/options.go index 6b5284422b..f6f4bdea0f 100644 --- a/test/e2e/framework/scaletest/options.go +++ b/test/e2e/framework/scaletest/options.go @@ -1,40 +1,8 @@ package scaletest -import "time" - // Options holds parameters for the scale test type Options struct { - Namespace string - MaxKwokPodsPerNode int - NumKwokDeployments int - NumKwokReplicas int - MaxRealPodsPerNode int - NumRealDeployments int - RealPodType string - NumRealReplicas int - NumRealServices int - NumNetworkPolicies int - NumUnapliedNetworkPolicies int - NumUniqueLabelsPerPod int - NumUniqueLabelsPerDeployment int - NumSharedLabelsPerPod int - KubeconfigPath string - RestartNpmPods bool - DebugExitAfterPrintCounts bool - DebugExitAfterGeneration bool - SleepAfterCreation time.Duration - DeleteKwokPods bool - DeleteRealPods bool - DeletePodsInterval time.Duration - DeletePodsTimes int - DeleteLabels bool - DeleteLabelsInterval time.Duration - DeleteLabelsTimes int - DeleteNetworkPolicies bool - DeleteNetworkPoliciesInterval time.Duration - DeleteNetworkPoliciesTimes int - numKwokPods int - numRealPods int - LabelsToGetMetrics map[string]string - AdditionalTelemetryProperty map[string]string + KubeconfigPath string + LabelsToGetMetrics map[string]string + AdditionalTelemetryProperty map[string]string } diff --git a/test/e2e/framework/scaletest/validate-options.go b/test/e2e/framework/scaletest/validate-options.go deleted file mode 100644 index 0dafdd2b06..0000000000 --- a/test/e2e/framework/scaletest/validate-options.go +++ /dev/null @@ -1,49 +0,0 @@ -package scaletest - -import ( - "errors" - "log" -) - -type ValidateAndPrintOptions struct { - Options *Options -} - -// Useful when wanting to do parameter checking, for example -// if a parameter length is known to be required less than 80 characters, -// do this here so we don't find out later on when we run the step -// when possible, try to avoid making external calls, this should be fast and simple -func (po *ValidateAndPrintOptions) Prevalidate() error { - if po.Options.MaxKwokPodsPerNode < 0 || - po.Options.NumKwokDeployments < 0 || - po.Options.NumKwokReplicas < 0 || - po.Options.MaxRealPodsPerNode < 0 || - po.Options.NumRealDeployments < 0 || - po.Options.NumRealReplicas < 0 || - po.Options.NumNetworkPolicies < 0 || - po.Options.NumUnapliedNetworkPolicies < 0 || - po.Options.NumUniqueLabelsPerPod < 0 || - po.Options.NumUniqueLabelsPerDeployment < 0 || - po.Options.NumSharedLabelsPerPod < 0 { - return errors.New("invalid negative value option for Scale step") - } - - if po.Options.NumNetworkPolicies > 0 && po.Options.NumSharedLabelsPerPod < 3 { - return errors.New("NumSharedLabelsPerPod must be at least 3 when NumNetworkPolicies > 0 because of the way Network Policies are generated") - } - - return nil -} - -// Returning an error will cause the test to fail -func (po *ValidateAndPrintOptions) Run() error { - - log.Printf("Starting to scale with folowing options: %+v", po.Options) - - return nil -} - -// Require for background steps -func (po *ValidateAndPrintOptions) Stop() error { - return nil -} diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index 89215785c1..a78ccf7cae 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -2,116 +2,36 @@ package retina import ( "os" - "time" - "github.com/microsoft/retina/test/e2e/framework/kubernetes" "github.com/microsoft/retina/test/e2e/framework/scaletest" "github.com/microsoft/retina/test/e2e/framework/types" ) func DefaultScaleTestOptions() scaletest.Options { return scaletest.Options{ - Namespace: "scale-test", - MaxKwokPodsPerNode: 0, - NumKwokDeployments: 0, - NumKwokReplicas: 0, - MaxRealPodsPerNode: 100, - NumRealDeployments: 1000, - RealPodType: "kapinger", - NumRealReplicas: 40, - NumRealServices: 1000, - NumNetworkPolicies: 10, - NumUnapliedNetworkPolicies: 10, - NumUniqueLabelsPerPod: 0, - NumUniqueLabelsPerDeployment: 1, - NumSharedLabelsPerPod: 3, - KubeconfigPath: "", - RestartNpmPods: false, - SleepAfterCreation: 0, - DeleteKwokPods: false, - DeletePodsInterval: 60 * time.Second, - DeleteRealPods: false, - DeletePodsTimes: 1, - DeleteLabels: false, - DeleteLabelsInterval: 60 * time.Second, - DeleteLabelsTimes: 1, - DeleteNetworkPolicies: false, - DeleteNetworkPoliciesInterval: 60 * time.Second, - DeleteNetworkPoliciesTimes: 1, - LabelsToGetMetrics: map[string]string{}, - AdditionalTelemetryProperty: map[string]string{}, + LabelsToGetMetrics: map[string]string{}, + AdditionalTelemetryProperty: map[string]string{}, } } func ScaleTest(opt *scaletest.Options) *types.Job { job := types.NewJob("Scale Test") - job.AddStep(&scaletest.ValidateAndPrintOptions{ - Options: opt, - }, nil) - - job.AddStep(&scaletest.ValidateNumOfNodes{ - KubeConfigFilePath: opt.KubeconfigPath, - Label: map[string]string{"scale-test": "true"}, - NumNodesRequired: (opt.NumRealDeployments*opt.NumRealReplicas + - opt.MaxRealPodsPerNode - 1) / opt.MaxRealPodsPerNode, - }, nil) - - job.AddStep(&kubernetes.DeleteNamespace{ - Namespace: opt.Namespace, - }, nil) - - job.AddStep(&kubernetes.CreateNamespace{}, nil) - job.AddStep(&scaletest.GetAndPublishMetrics{ + KubeConfigFilePath: opt.KubeconfigPath, Labels: opt.LabelsToGetMetrics, AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty, - OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), + OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), }, &types.StepOptions{ SkipSavingParametersToJob: true, - RunInBackgroundWithID: "get-metrics", + RunInBackgroundWithID: "metrics", }) - job.AddStep(&scaletest.CreateResources{ - NumKwokDeployments: opt.NumKwokDeployments, - NumKwokReplicas: opt.NumKwokReplicas, - RealPodType: opt.RealPodType, - NumRealDeployments: opt.NumRealDeployments, - NumRealReplicas: opt.NumRealReplicas, - NumRealServices: opt.NumRealServices, - NumUniqueLabelsPerDeployment: opt.NumUniqueLabelsPerDeployment, - }, nil) - - job.AddStep(&scaletest.AddSharedLabelsToAllPods{ - NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod, - }, nil) - - job.AddStep(&scaletest.AddUniqueLabelsToAllPods{ - NumUniqueLabelsPerPod: opt.NumUniqueLabelsPerPod, - }, nil) - - // Apply network policies (applied and unapplied) - job.AddStep(&scaletest.CreateNetworkPolicies{ - NumNetworkPolicies: opt.NumNetworkPolicies, - NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod, - }, nil) - - job.AddStep(&kubernetes.WaitPodsReady{ - LabelSelector: "is-real=true", - }, nil) - - job.AddStep(&scaletest.DeleteAndReAddLabels{ - DeleteLabels: opt.DeleteLabels, - DeleteLabelsInterval: opt.DeleteLabelsInterval, - DeleteLabelsTimes: opt.DeleteLabelsTimes, - NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod, - }, nil) + job.AddStep(&scaletest.ClusterLoader2{}, nil) job.AddStep(&types.Stop{ - BackgroundID: "get-metrics", + BackgroundID: "metrics", }, nil) - job.AddStep(&kubernetes.DeleteNamespace{}, nil) - return job } diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 6769dccc09..4cbae8baef 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -3,11 +3,8 @@ package retina import ( - "crypto/rand" - "math/big" "os" "path/filepath" - "strconv" "testing" "github.com/microsoft/retina/test/e2e/common" @@ -28,57 +25,19 @@ func TestE2ERetina_Scale(t *testing.T) { subID := os.Getenv("AZURE_SUBSCRIPTION_ID") require.NotEmpty(t, subID) - location := os.Getenv("AZURE_LOCATION") - if location == "" { - nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(common.AzureLocations)))) - if err != nil { - t.Fatal("Failed to generate a secure random index", err) - } - location = common.AzureLocations[nBig.Int64()] - } - rg := os.Getenv("AZURE_RESOURCE_GROUP") if rg == "" { // Use the cluster name as the resource group name by default. rg = clusterName } - cwd, err := os.Getwd() - require.NoError(t, err) - - // Get to root of the repo by going up two directories - rootDir := filepath.Dir(filepath.Dir(cwd)) - - chartPath := filepath.Join(rootDir, "deploy", "legacy", "manifests", "controller", "helm", "retina") - kubeConfigFilePath := filepath.Join(rootDir, "test", "e2e", "test.pem") + kubeConfigFilePath := filepath.Join(os.Getenv("HOME"), ".kube", "config") // Scale test parameters opt := jobs.DefaultScaleTestOptions() opt.KubeconfigPath = kubeConfigFilePath - NumDeployments := os.Getenv("NUM_DEPLOYMENTS") - NumReplicas := os.Getenv("NUM_REPLICAS") - NumNetworkPolicies := os.Getenv("NUM_NET_POL") - CleanUp := os.Getenv("CLEANUP") - - if NumDeployments != "" { - opt.NumRealDeployments, err = strconv.Atoi(NumDeployments) - opt.NumRealServices = opt.NumRealDeployments - require.NoError(t, err) - } - if NumReplicas != "" { - opt.NumRealReplicas, err = strconv.Atoi(NumReplicas) - require.NoError(t, err) - } - if NumNetworkPolicies != "" { - opt.NumNetworkPolicies, err = strconv.Atoi(NumNetworkPolicies) - require.NoError(t, err) - } - if CleanUp != "" { - opt.DeleteLabels, err = strconv.ParseBool(CleanUp) - require.NoError(t, err) - } - + // TODO: Get Retina Version from cluster or change ENV VAR RetinaVersion := os.Getenv(generic.DefaultTagEnv) require.NotEmpty(t, RetinaVersion) opt.AdditionalTelemetryProperty["retinaVersion"] = RetinaVersion @@ -87,30 +46,13 @@ func TestE2ERetina_Scale(t *testing.T) { // AppInsightsKey is required for telemetry require.NotEmpty(t, os.Getenv(common.AzureAppInsightsKeyEnv)) + // Agent label opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"} - // CreateTestInfra - createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, *common.CreateInfra)) - createTestInfra.Run(ctx) - - t.Cleanup(func() { - if *common.DeleteInfra { - _ = jobs.DeleteTestInfra(subID, rg, clusterName, location).Run() - } - }) - fqdn, err := azure.GetFqdnFn(subID, rg, clusterName) require.NoError(t, err) opt.AdditionalTelemetryProperty["clusterFqdn"] = fqdn - // Install Retina - installRetina := types.NewRunner(t, jobs.InstallRetina(kubeConfigFilePath, chartPath)) - installRetina.Run(ctx) - - t.Cleanup(func() { - _ = jobs.UninstallRetina(kubeConfigFilePath, chartPath).Run() - }) - scale := types.NewRunner(t, jobs.ScaleTest(&opt)) scale.Run(ctx) }