From 71d7f074440826bccafcb9f06d63ea8be8d68d64 Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Thu, 16 Jan 2025 14:44:03 +0000 Subject: [PATCH 1/3] change rollout strategy to avoid deleting worker nodes fully create upgraded worker node before deleting old one. Allows things like longhorn volumes to migrate better --- charts/dev/capi-infra/values.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/charts/dev/capi-infra/values.yaml b/charts/dev/capi-infra/values.yaml index e9a79e94..e9831a51 100644 --- a/charts/dev/capi-infra/values.yaml +++ b/charts/dev/capi-infra/values.yaml @@ -354,6 +354,19 @@ openstack-cluster: autoscale: false machineFlavor: l3.micro + rolloutStrategy: + type: RollingUpdate + rollingUpdate: + # The maximum number of node group machines that can be unavailable during the update + # Can be an absolute number or a percentage of the desired count + maxUnavailable: 0 + # The maximum number of machines that can be scheduled above the desired count for + # the group during an update + # Can be an absolute number or a percentage of the desired count + maxSurge: 1 + # One of Random, Newest, Oldest + deletePolicy: Random + healthCheck: enabled: true spec: From e044ed285b5190a30574fc0297f8ca0bd1eed62c Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Thu, 16 Jan 2025 14:35:01 +0000 Subject: [PATCH 2/3] Promote dev capi changes to staging promote all dev changes to capi-infra chart to staging --- charts/staging/capi-infra/Chart.yaml | 2 +- charts/staging/capi-infra/values.yaml | 17 +++++++++++++++-- clusters/dev/worker/infra-values.yaml | 1 - clusters/staging/management/infra-values.yaml | 12 ++++++------ clusters/staging/worker/infra-values.yaml | 19 ++++++++++++------- 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/charts/staging/capi-infra/Chart.yaml b/charts/staging/capi-infra/Chart.yaml index d4688bb7..52fe2ee9 100644 --- a/charts/staging/capi-infra/Chart.yaml +++ b/charts/staging/capi-infra/Chart.yaml @@ -4,4 +4,4 @@ version: 1.3.0 dependencies: - repository: https://azimuth-cloud.github.io/capi-helm-charts name: openstack-cluster - version: 0.11.2 + version: 0.12.2 diff --git a/charts/staging/capi-infra/values.yaml b/charts/staging/capi-infra/values.yaml index 8ef58926..e9831a51 100644 --- a/charts/staging/capi-infra/values.yaml +++ b/charts/staging/capi-infra/values.yaml @@ -1,6 +1,6 @@ openstack-cluster: - kubernetesVersion: "1.30.6" - machineImage: "capi-ubuntu-2204-kube-v1.30.6-2024-11-15" + kubernetesVersion: "1.31.4" + machineImage: "capi-ubuntu-2204-kube-v1.31.4-2025-01-07" # The PEM-encoded CA certificate for openstack.stfc.ac.uk # this expires 2023-12-05T23:59:59Z (UTC) @@ -354,6 +354,19 @@ openstack-cluster: autoscale: false machineFlavor: l3.micro + rolloutStrategy: + type: RollingUpdate + rollingUpdate: + # The maximum number of node group machines that can be unavailable during the update + # Can be an absolute number or a percentage of the desired count + maxUnavailable: 0 + # The maximum number of machines that can be scheduled above the desired count for + # the group during an update + # Can be an absolute number or a percentage of the desired count + maxSurge: 1 + # One of Random, Newest, Oldest + deletePolicy: Random + healthCheck: enabled: true spec: diff --git a/clusters/dev/worker/infra-values.yaml b/clusters/dev/worker/infra-values.yaml index 752ed526..67fc841c 100644 --- a/clusters/dev/worker/infra-values.yaml +++ b/clusters/dev/worker/infra-values.yaml @@ -9,7 +9,6 @@ openstack-cluster: machineFlavor: l3.micro nodeGroupDefaults: - machineFlavor: l3.nano nodeLabels: # we're running longhorn on this cluster # set label so worker nodes can host longhorn volumes diff --git a/clusters/staging/management/infra-values.yaml b/clusters/staging/management/infra-values.yaml index 3fc81f7a..b0eb41fa 100644 --- a/clusters/staging/management/infra-values.yaml +++ b/clusters/staging/management/infra-values.yaml @@ -24,25 +24,25 @@ openstack-cluster: env: staging ingress: hosts: - - prometheus-mgmt.staging.nubes.stfc.ac.uk + - prometheus.staging-mgmt.nubes.stfc.ac.uk tls: - hosts: - - prometheus-mgmt.staging.nubes.stfc.ac.uk + - prometheus.staging-mgmt.nubes.stfc.ac.uk secretName: tls-keypair grafana: ingress: hosts: - - grafana-mgmt.staging.nubes.stfc.ac.uk + - grafana.staging-mgmt.nubes.stfc.ac.uk tls: - hosts: - - grafana-mgmt.staging.nubes.stfc.ac.uk + - grafana.staging-mgmt.nubes.stfc.ac.uk secretName: tls-keypair alertmanager: enabled: true ingress: hosts: - - alertmanager-mgmt.staging.nubes.stfc.ac.uk + - alertmanager.staging-mgmt.nubes.stfc.ac.uk tls: - hosts: - - alertmanager-mgmt.staging.nubes.stfc.ac.uk + - alertmanager.staging-mgmt.nubes.stfc.ac.uk secretName: tls-keypair diff --git a/clusters/staging/worker/infra-values.yaml b/clusters/staging/worker/infra-values.yaml index 412df239..dfe1f38e 100644 --- a/clusters/staging/worker/infra-values.yaml +++ b/clusters/staging/worker/infra-values.yaml @@ -1,8 +1,12 @@ openstack-cluster: + controlPlane: + machineCount: 3 + nodeGroups: - name: default-md-0 - machineCount: 5 + machineCount: 3 + machineFlavor: l3.micro nodeGroupDefaults: machineFlavor: l3.nano @@ -22,6 +26,7 @@ openstack-cluster: loadBalancerIP: "130.246.81.242" monitoring: + enabled: true # no need to send alerts around certs/openstack API endpoints for dev/staging clusters # ends up with too many messages in the ticket queue blackBoxExporter: @@ -36,25 +41,25 @@ openstack-cluster: env: staging ingress: hosts: - - prometheus-worker.staging.nubes.stfc.ac.uk + - prometheus.staging-worker.nubes.stfc.ac.uk tls: - hosts: - - prometheus-worker.staging.nubes.stfc.ac.uk + - prometheus.staging-worker.nubes.stfc.ac.uk secretName: tls-keypair grafana: ingress: hosts: - - grafana-worker.staging.nubes.stfc.ac.uk + - grafana.staging-worker.nubes.stfc.ac.uk tls: - hosts: - - grafana-worker.staging.nubes.stfc.ac.uk + - grafana.staging-worker.nubes.stfc.ac.uk secretName: tls-keypair alertmanager: enabled: true ingress: hosts: - - alertmanager-worker.staging.nubes.stfc.ac.uk + - alertmanager.staging-worker.nubes.stfc.ac.uk tls: - hosts: - - alertmanager-worker.staging.nubes.stfc.ac.uk + - alertmanager.staging-worker.nubes.stfc.ac.uk secretName: tls-keypair \ No newline at end of file From 213b13cc23adbbccd3760c9c056d3ae0273058c4 Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Thu, 16 Jan 2025 14:41:58 +0000 Subject: [PATCH 3/3] promote longhorn changes bump to 1.7.1 tweaks to values to what they were expected --- charts/staging/longhorn/Chart.yaml | 5 +++++ charts/staging/longhorn/requirements.yaml | 5 ----- charts/staging/longhorn/values.yaml | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) delete mode 100644 charts/staging/longhorn/requirements.yaml diff --git a/charts/staging/longhorn/Chart.yaml b/charts/staging/longhorn/Chart.yaml index 2a76c063..289c3e55 100644 --- a/charts/staging/longhorn/Chart.yaml +++ b/charts/staging/longhorn/Chart.yaml @@ -1,3 +1,8 @@ apiVersion: v2 name: longhorn version: 1.0.0 +dependencies: +# https://github.com/longhorn/charts/releases +- name: longhorn + version: 1.7.1 + repository: https://charts.longhorn.io diff --git a/charts/staging/longhorn/requirements.yaml b/charts/staging/longhorn/requirements.yaml deleted file mode 100644 index 1fb12ee8..00000000 --- a/charts/staging/longhorn/requirements.yaml +++ /dev/null @@ -1,5 +0,0 @@ -dependencies: -# https://github.com/longhorn/charts/releases -- name: longhorn - version: 1.6.2 - repository: https://charts.longhorn.io diff --git a/charts/staging/longhorn/values.yaml b/charts/staging/longhorn/values.yaml index a842c562..95e81691 100644 --- a/charts/staging/longhorn/values.yaml +++ b/charts/staging/longhorn/values.yaml @@ -21,14 +21,15 @@ longhorn: defaultSettings: taintToleration: "nvidia.com/gpu:NoSchedule" snapshotMaxCount: 10 - snapshotDataIntegrity: true - snapshotDataIntegtrityCronjob: true - replicaAutoBalance: true + snapshotDataIntegrity: "enabled" + snapshotDataIntegrityCronjob: "0 12 * * 1" + replicaAutoBalance: "best-effort" autoDeletePodWhenVolumeDetachedUnexpectedly: true allowVolumeCreationWithDegradedAvailability: true + nodeDrainPolicy: "block-for-eviction" persistence: defaultClassReplicaCount: 3 defaultDataLocality: disabled migratable: "true" - \ No newline at end of file +