Skip to content

Commit

Permalink
feat: fabric and gateway health probe
Browse files Browse the repository at this point in the history
  • Loading branch information
cheina97 committed Jan 10, 2025
1 parent 09b5e04 commit 57c0926
Show file tree
Hide file tree
Showing 11 changed files with 139 additions and 8 deletions.
9 changes: 9 additions & 0 deletions cmd/fabric/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

Expand Down Expand Up @@ -142,6 +143,14 @@ func run(cmd *cobra.Command, _ []string) error {
return fmt.Errorf("unable to create manager: %w", err)
}

// Register the healthiness probes.
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up healthz probe: %w", err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up readyz probe: %w", err)
}

gwr, err := sourcedetector.NewGatewayReconciler(
mgr.GetClient(),
mgr.GetScheme(),
Expand Down
9 changes: 9 additions & 0 deletions cmd/gateway/geneve/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

Expand Down Expand Up @@ -95,6 +96,14 @@ func run(cmd *cobra.Command, _ []string) error {
return fmt.Errorf("unable to create manager: %w", err)
}

// Register the healthiness probes.
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up healthz probe: %w", err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up readyz probe: %w", err)
}

inr, err := geneve.NewInternalNodeReconciler(
mgr.GetClient(),
mgr.GetScheme(),
Expand Down
9 changes: 9 additions & 0 deletions cmd/gateway/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

Expand Down Expand Up @@ -160,6 +161,14 @@ func run(cmd *cobra.Command, _ []string) error {
return fmt.Errorf("unable to create manager: %w", err)
}

// Register the healthiness probes.
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up healthz probe: %w", err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up readyz probe: %w", err)
}

if connoptions.EnableConnectionController {
// Setup the connection controller.
connr, err := connection.NewConnectionsReconciler(
Expand Down
9 changes: 9 additions & 0 deletions cmd/gateway/wireguard/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
Expand Down Expand Up @@ -107,6 +108,14 @@ func run(cmd *cobra.Command, _ []string) error {
return fmt.Errorf("unable to create manager: %w", err)
}

// Register the healthiness probes.
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up healthz probe: %w", err)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
return fmt.Errorf("unable to set up readyz probe: %w", err)
}

// Setup the controller.
pkr, err := wireguard.NewPublicKeysReconciler(
mgr.GetClient(),
Expand Down
2 changes: 2 additions & 0 deletions deployments/liqo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@
| networking.enabled | bool | `true` | Use the default Liqo networking module. |
| networking.fabric.config.fullMasquerade | bool | `false` | Enable/Disable the full masquerade mode for the fabric pod. It means that all traffic will be masqueraded using the first external cidr IP, instead of using the pod IP. Full masquerade is useful when the cluster nodeports use a PodCIDR IP to masquerade the incoming traffic. IMPORTANT: Please consider that enabling this feature will masquerade the source IP of traffic towards a remote cluster, making it impossible for a pod that receives the traffic to know the original source IP. |
| networking.fabric.config.gatewayMasqueradeBypass | bool | `false` | Enable/Disable the masquerade bypass for the gateway pods. It means that the packets from gateway pods will not be masqueraded from the host where the pod is scheduled. This is useful in scenarios where CNIs masquerade the traffic from pod to nodes. For example this is required when using the Azure CNI or Kindnet. |
| networking.fabric.config.healthProbeBindAddressPort | string | `"8081"` | Set the port where the fabric pod will expose the health probe. To disable the health probe, set the port to 0. |
| networking.fabric.config.metricsAddressPort | string | `"8082"` | Set the port where the fabric pod will expose the metrics. To disable the metrics, set the port to 0. |
| networking.fabric.config.nftablesMonitor | bool | `true` | Enable/Disable the nftables monitor for the fabric pod. It means that the fabric pod will monitor the nftables rules and will restore them in case of changes. In some cases (like K3S), this monitor can cause a huge amount of CPU usage. If you are experiencing high CPU usage, you can disable this feature. |
| networking.fabric.image.name | string | `"ghcr.io/liqotech/fabric"` | Image repository for the fabric pod. |
| networking.fabric.image.version | string | `""` | Custom version for the fabric image. If not specified, the global tag is used. |
Expand Down
12 changes: 12 additions & 0 deletions deployments/liqo/templates/liqo-fabric-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ spec:
- --podname=$(POD_NAME)
- --nodename=$(NODE_NAME)
- --geneve-port={{ .Values.networking.genevePort }}
- --health-probe-bind-address=:{{ .Values.networking.fabric.config.healthProbeBindAddressPort}}
- --metrics-address=:{{ .Values.networking.fabric.config.metricsAddressPort}}
{{- if not .Values.requirements.kernel.enabled }}
- --disable-kernel-version-check
{{- end }}
Expand Down Expand Up @@ -79,6 +81,16 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
{{- if and .Values.networking.fabric.config.healthProbeBindAddressPort (ne .Values.networking.fabric.config.healthProbeBindAddressPort "0") }}
ports:
- name: healthz
containerPort: {{ .Values.networking.fabric.config.healthProbeBindAddressPort }}
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
port: healthz
{{- end }}
hostNetwork: true
{{- if .Values.networking.fabric.pod.priorityClassName }}
priorityClassName: {{ .Values.networking.fabric.pod.priorityClassName }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ spec:
- containerPort: 8082
name: gw-metrics
{{- end }}
- containerPort: 8083
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -117,6 +125,14 @@ spec:
- containerPort: 8084
name: wg-metrics
{{- end }}
- containerPort: 8085
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
securityContext:
capabilities:
add:
Expand Down Expand Up @@ -155,6 +171,14 @@ spec:
- containerPort: 8086
name: gv-metrics
{{- end }}
- containerPort: 8087
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ spec:
- containerPort: 8082
name: gw-metrics
{{- end }}
ports:
- containerPort: 8083
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -143,6 +152,15 @@ spec:
- containerPort: 8084
name: wg-metrics
{{- end }}
ports:
- containerPort: 8085
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
securityContext:
capabilities:
add:
Expand Down Expand Up @@ -172,7 +190,7 @@ spec:
{{- if .Values.metrics.enabled }}
- --metrics-address=:8086
{{- end }}
- --health-probe-bind-address=:8086
- --health-probe-bind-address=:8087
volumeMounts:
- name: ipc
mountPath: /ipc
Expand All @@ -181,6 +199,15 @@ spec:
- containerPort: 8086
name: gv-metrics
{{- end }}
ports:
- containerPort: 8087
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ spec:
{{- include "liqo.concatenateMap" $d | nindent 16 }}
{{- end }}
{{- if .Values.metrics.enabled }}
- --metrics-address=:8084
- --metrics-address=:8082
{{- end }}
- --health-probe-bind-address=:8085
- --health-probe-bind-address=:8083
- --ping-enabled=true
- --ping-loss-threshold={{ .Values.networking.gatewayTemplates.ping.lossThreshold }}
- --ping-interval={{ .Values.networking.gatewayTemplates.ping.interval }}
Expand All @@ -96,9 +96,17 @@ spec:
mountPath: /ipc
{{- if .Values.metrics.enabled }}
ports:
- containerPort: 8084
- containerPort: 8082
name: gw-metrics
{{- end }}
- containerPort: 8083
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -127,15 +135,23 @@ spec:
- --mtu={{"{{ .Spec.MTU }}"}}
- --listen-port={{"{{ .Spec.Endpoint.Port }}"}}
{{- if .Values.metrics.enabled }}
- --metrics-address=:8082
- --metrics-address=:8084
{{- end }}
- --health-probe-bind-address=:8083
- --health-probe-bind-address=:8085
- --implementation={{ .Values.networking.gatewayTemplates.wireguard.implementation }}
{{- if .Values.metrics.enabled }}
ports:
- containerPort: 8082
- containerPort: 8084
name: wg-metrics
{{- end }}
- containerPort: 8085
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
securityContext:
capabilities:
add:
Expand Down Expand Up @@ -174,6 +190,14 @@ spec:
- containerPort: 8086
name: gv-metrics
{{- end }}
- containerPort: 8087
name: healthz
# ATTENTION: uncomment the readinessProbe section if you are aware of the consequences.
# If you have more replicas of the same gateway, the passive ones will not reach the ready state.
#readinessProbe:
# httpGet:
# path: /readyz
# port: healthz
env:
- name: NODE_NAME
valueFrom:
Expand Down
6 changes: 6 additions & 0 deletions deployments/liqo/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ networking:
# In some cases (like K3S), this monitor can cause a huge amount of CPU usage.
# If you are experiencing high CPU usage, you can disable this feature.
nftablesMonitor: true
# -- Set the port where the fabric pod will expose the health probe.
# To disable the health probe, set the port to 0.
healthProbeBindAddressPort: "8081"
# -- Set the port where the fabric pod will expose the metrics.
# To disable the metrics, set the port to 0.
metricsAddressPort: "8082"

authentication:
# -- Enable/Disable the authentication module.
Expand Down
2 changes: 1 addition & 1 deletion pkg/gateway/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func InitFlags(flagset *pflag.FlagSet, opts *Options) {
"RetryPeriod for the leader election")

flagset.StringVar(&opts.MetricsAddress, FlagNameMetricsAddress.String(), "0", "Address for the metrics endpoint")
flagset.StringVar(&opts.ProbeAddr, FlagNameProbeAddr.String(), ":8081", "Address for the health probe endpoint")
flagset.StringVar(&opts.ProbeAddr, FlagNameProbeAddr.String(), "0", "Address for the health probe endpoint")

flagset.BoolVar(&opts.DisableKernelVersionCheck, FlagNameDisableKernelVersionCheck.String(), false, "Disable the kernel version check")
flagset.Var(&opts.MinimumKernelVersion, FlagNameMinimumKernelVersion.String(), "Minimum kernel version required by Liqo")
Expand Down

0 comments on commit 57c0926

Please sign in to comment.