From 0761d4b63d1b013d28564e52d92d7ed698eccc88 Mon Sep 17 00:00:00 2001
From: Wei Fu
Date: Tue, 23 Apr 2024 09:12:50 +0000
Subject: [PATCH] *: init node100_dp5_pod10k for runkperf bench

Signed-off-by: Wei Fu
---
 .../commands/bench/node100_dp5_pod10k.go      | 108 ++++++++++++++++++
 contrib/cmd/runkperf/commands/bench/root.go   |   1 +
 .../loadprofile/node100_dp5_pod10k.yaml       |  40 +++++++
 3 files changed, 149 insertions(+)
 create mode 100644 contrib/cmd/runkperf/commands/bench/node100_dp5_pod10k.go
 create mode 100644 contrib/internal/manifests/loadprofile/node100_dp5_pod10k.yaml

diff --git a/contrib/cmd/runkperf/commands/bench/node100_dp5_pod10k.go b/contrib/cmd/runkperf/commands/bench/node100_dp5_pod10k.go
new file mode 100644
index 0000000..60b8e50
--- /dev/null
+++ b/contrib/cmd/runkperf/commands/bench/node100_dp5_pod10k.go
@@ -0,0 +1,108 @@
+package bench
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	internaltypes "github.com/Azure/kperf/contrib/internal/types"
+	"github.com/Azure/kperf/contrib/internal/utils"
+
+	"github.com/urfave/cli"
+)
+
+var benchNode100Deployment5Pod10KCase = cli.Command{
+	Name: "node100_dp5_pod10k",
+	Usage: `
+
+The test suite sets up 100 virtual nodes and deploys 5 deployments with 10k
+pods in total on those nodes. It repeatedly rolling-updates the deployments
+one by one during the benchmark.
+	`,
+	Flags: []cli.Flag{
+		cli.IntFlag{
+			Name:  "total",
+			Usage: "Total requests per runner (there are 10 runners in total and each runner's rate is 10)",
+			Value: 36000,
+		},
+		cli.IntFlag{
+			Name:  "podsize",
+			Usage: "Pad each pod's annotations to increase the pod object's size. The resulting pod size is close to this value (in bytes)",
+			Value: 0,
+		},
+	},
+	Action: func(cliCtx *cli.Context) error {
+		_, err := renderBenchmarkReportInterceptor(
+			addAPIServerCoresInfoInterceptor(benchNode100Deployment5Pod10KRun),
+		)(cliCtx)
+		return err
+	},
+}
+
+// benchNode100Deployment5Pod10KRun is the run function for subcommand benchNode100Deployment5Pod10KCase.
+func benchNode100Deployment5Pod10KRun(cliCtx *cli.Context) (*internaltypes.BenchmarkReport, error) {
+	ctx := context.Background()
+	kubeCfgPath := cliCtx.GlobalString("kubeconfig")
+
+	rgCfgFile, rgSpec, rgCfgFileDone, err := newLoadProfileFromEmbed(cliCtx,
+		"loadprofile/node100_dp5_pod10k.yaml")
+	if err != nil {
+		return nil, err
+	}
+	defer func() { _ = rgCfgFileDone() }()
+
+	vcDone, err := deployVirtualNodepool(ctx, cliCtx, "node100dp5pod10k", 100, 150)
+	if err != nil {
+		return nil, fmt.Errorf("failed to deploy virtual node: %w", err)
+	}
+	defer func() { _ = vcDone() }()
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	restartInterval := 10 * time.Second
+	dpCtx, dpCancel := context.WithCancel(ctx)
+
+	podSize := cliCtx.Int("podsize")
+	rollingUpdateFn, err := utils.RepeatRollingUpdate10KPod(dpCtx, kubeCfgPath, "dp5pod10k", podSize, restartInterval)
+	if err != nil {
+		dpCancel()
+		return nil, fmt.Errorf("failed to setup workload: %w", err)
+	}
+
+	go func() {
+		defer wg.Done()
+
+		// FIXME(weifu):
+		//
+		// DeployRunnerGroup should return ready notification.
+		// The rolling update should run after runners.
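+		//
+		// Until then, the rolling updates run concurrently with the
+		// runner group: dpCancel() below cancels dpCtx once
+		// DeployRunnerGroup returns, and wg.Wait() then blocks until
+		// this goroutine has exited.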
+		rollingUpdateFn()
+	}()
+
+	rgResult, derr := utils.DeployRunnerGroup(ctx,
+		kubeCfgPath,
+		cliCtx.GlobalString("runner-image"),
+		rgCfgFile,
+		cliCtx.GlobalString("runner-flowcontrol"),
+		cliCtx.GlobalString("rg-affinity"),
+	)
+	dpCancel()
+	wg.Wait()
+
+	if derr != nil {
+		return nil, derr
+	}
+
+	return &internaltypes.BenchmarkReport{
+		Description: fmt.Sprintf(`
+Environment: 100 virtual nodes managed by kwok-controller,
+Workload: Deploy 5 deployments with 10,000 pods in total. Deployments are rolling-updated one by one at an interval of %v`, restartInterval),
+		LoadSpec: *rgSpec,
+		Result:   *rgResult,
+		Info: map[string]interface{}{
+			"podSizeInBytes": podSize,
+		},
+	}, nil
+}
diff --git a/contrib/cmd/runkperf/commands/bench/root.go b/contrib/cmd/runkperf/commands/bench/root.go
index 0a7193e..945d7e0 100644
--- a/contrib/cmd/runkperf/commands/bench/root.go
+++ b/contrib/cmd/runkperf/commands/bench/root.go
@@ -53,5 +53,6 @@ var Command = cli.Command{
 	},
 	Subcommands: []cli.Command{
 		benchNode100Job1Pod3KCase,
+		benchNode100Deployment5Pod10KCase,
 	},
 }
diff --git a/contrib/internal/manifests/loadprofile/node100_dp5_pod10k.yaml b/contrib/internal/manifests/loadprofile/node100_dp5_pod10k.yaml
new file mode 100644
index 0000000..4b75cd5
--- /dev/null
+++ b/contrib/internal/manifests/loadprofile/node100_dp5_pod10k.yaml
@@ -0,0 +1,40 @@
+count: 10
+loadProfile:
+  version: 1
+  description: "node100-deployment5-pod10k"
+  spec:
+    rate: 10
+    total: 36000
+    conns: 10
+    client: 100
+    contentType: json
+    disableHTTP2: false
+    maxRetries: 0
+    requests:
+      - staleList:
+          version: v1
+          resource: pods
+          # NOTE: Please align with ../../utils/utils.go#RepeatRollingUpdate10KPod
+          seletor: "app=benchmark"
+          # NOTE: Please align with ../../../cmd/runkperf/commands/bench/node100_dp5_pod10k.go.
+          # There are only 100 nodes and each node can run up to 150 pods, so
+          # the response should contain items.
+          fieldSelector: "spec.nodeName=node100dp5pod10k-49"
+        shares: 1000 # 1000 / (1000 + 100 + 200) * 10 = 7.7 req/s
+      - staleList:
+          version: v1
+          resource: pods
+        shares: 100 # 100 / (1000 + 100 + 200) * 10 = 0.8 req/s
+      - quorumList:
+          version: v1
+          resource: pods
+          namespace: benchmark-0
+          # NOTE: This simulates the list-pods requests issued by node-local
+          # daemons, including the kubelet, when they read pods from etcd.
+          # The limit is 100 because it's close to the MaxPods value.
+          limit: 100
+          # NOTE: Please align with ../../../cmd/runkperf/commands/bench/node100_dp5_pod10k.go.
+          # There are only 100 nodes and each node can run up to 150 pods, so
+          # the response should contain items.
+          fieldSelector: "spec.nodeName=node100dp5pod10k-49"
+        shares: 200 # 200 / (1000 + 100 + 200) * 10 = 1.5 req/s
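+# NOTE: Expected scale of the generated load, assuming each runner enforces
+# its own rate/total as the --total flag's help text describes:
+#   per runner: 36000 requests at 10 req/s ~= 3600s (about one hour)
+#   all 10 runners combined: 360000 requests at ~100 req/s aggregate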