diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 07e424761..d987a6521 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -482,12 +482,20 @@ type DriverSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. // UseOpenKernelModules indicates if the open GPU kernel modules should be used // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"` + // KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + // Accepted values are auto, proprietary and open. 
NOTE: If auto is chosen, it means that the recommended kernel module + // type is chosen based on the GPU devices on the host and the driver branch used + // +kubebuilder:validation:Enum=auto;open;proprietary + // +kubebuilder:default=auto + KernelModuleType string `json:"kernelModuleType,omitempty"` + // Enabled indicates if deployment of NVIDIA Driver through operator is enabled // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator" @@ -1856,11 +1864,7 @@ func (d *DriverSpec) UsePrecompiledDrivers() bool { // OpenKernelModulesEnabled returns true if driver install is enabled using open GPU kernel modules func (d *DriverSpec) OpenKernelModulesEnabled() bool { - if d.UseOpenKernelModules == nil { - // default is false if not specified by user - return false - } - return *d.UseOpenKernelModules + return d.KernelModuleType == "open" } // IsEnabled returns true if device-plugin is enabled(default) through gpu-operator diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 86bae0b48..f45d69242 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -53,12 +53,20 @@ type NVIDIADriverSpec struct { // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="usePrecompiled is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting." UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
// UseOpenKernelModules indicates if the open GPU kernel modules should be used // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"` + // KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + // Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + // type is chosen based on the GPU devices on the host and the driver branch used + // +kubebuilder:validation:Enum=auto;open;proprietary + // +kubebuilder:default=auto + KernelModuleType string `json:"kernelModuleType,omitempty"` + // NVIDIA Driver container startup probe settings StartupProbe *ContainerProbeSpec `json:"startupProbe,omitempty"` @@ -642,10 +650,7 @@ func (d *NVIDIADriverSpec) IsGDRCopyEnabled() bool { // IsOpenKernelModulesEnabled returns true if NVIDIA OpenRM drivers are enabled func (d *NVIDIADriverSpec) IsOpenKernelModulesEnabled() bool { - if d.UseOpenKernelModules == nil || !*d.UseOpenKernelModules { - return false - } - return true + return d.KernelModuleType == "open" } // IsOpenKernelModulesRequired returns true if NVIDIA OpenRM drivers required in this configuration diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. 
+ Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/bundle/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/manifests/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/bundle/manifests/nvidia.com_nvidiadrivers.yaml +++ b/bundle/manifests/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
+ UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. 
+ Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1b1801391..f52de886a 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -165,6 +165,8 @@ const ( DefaultCCModeEnvName = "DEFAULT_CC_MODE" // OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED" + // KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type + KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE" // MPSRootEnvName is the name of the envvar for configuring the MPS root MPSRootEnvName = "MPS_ROOT" // DefaultMPSRoot is the default MPS root path on the host @@ -3166,8 +3168,13 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy setContainerEnv(driverContainer, env.Name, env.Value) } } - if config.Driver.OpenKernelModulesEnabled() { - setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true") + + if len(config.Driver.KernelModuleType) > 0 { + setContainerEnv(driverContainer, KernelModuleTypeEnvName, config.Driver.KernelModuleType) + // we set the "OPEN_KERNEL_MODULES_ENABLED" 
envvar for backwards compatibility with older driver containers + if config.Driver.OpenKernelModulesEnabled() { + setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true") + } } // set container probe timeouts diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
+ UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. 
+ UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index af9e87c38..0a753c4a8 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -143,7 +143,7 @@ spec: driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} - useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} {{- if .Values.driver.repository }} repository: {{ .Values.driver.repository }} diff --git a/deployments/gpu-operator/templates/nvidiadriver.yaml b/deployments/gpu-operator/templates/nvidiadriver.yaml index 31660c025..cbe567135 100644 --- a/deployments/gpu-operator/templates/nvidiadriver.yaml +++ b/deployments/gpu-operator/templates/nvidiadriver.yaml @@ -7,7 +7,7 @@ spec: repository: {{ .Values.driver.repository }} image: {{ .Values.driver.image }} version: {{ .Values.driver.version }} - useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} driverType: {{ .Values.driver.nvidiaDriverCRD.driverType | default "gpu" }} {{- if .Values.daemonsets.annotations }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 5e404f081..e0daee03c 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -137,7 +137,11 @@ driver: deployDefaultCR: true driverType: gpu nodeSelector: {} - useOpenKernelModules: false + kernelModuleType: "auto" + + # NOTE: useOpenKernelModules has been deprecated and made no-op. 
Please use kernelModuleType instead. + # useOpenKernelModules: false + # use pre-compiled packages for NVIDIA driver installation. # only supported for as a tech-preview feature on ubuntu22.04 kernels. usePrecompiled: false diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index a591fa9d3..3f9287bee 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -221,8 +221,8 @@ func TestDriverSpec(t *testing.T) { Effect: "NoSchedule", }, }, - PriorityClassName: "custom-priority-class-name", - UseOpenKernelModules: utils.BoolPtr(true), + PriorityClassName: "custom-priority-class-name", + KernelModuleType: "open", } driverSpec.Labels = sanitizeDriverLabels(driverSpec.Labels) diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 60065333e..87c6803b8 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -146,6 +146,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: KERNEL_MODULE_TYPE + value: open - name: OPEN_KERNEL_MODULES_ENABLED value: "true" - name: FOO diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 26dfecf15..643d8a57f 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -213,9 +213,14 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP - {{- if deref .Driver.Spec.UseOpenKernelModules }} + {{- if .Driver.Spec.KernelModuleType }} + - name: KERNEL_MODULE_TYPE + value: {{ .Driver.Spec.KernelModuleType }} + # we set this env var for backwards compatibility with older driver versions + {{- if eq .Driver.Spec.KernelModuleType "open"}} - name: OPEN_KERNEL_MODULES_ENABLED value: "true" + {{- end }} {{- end }} {{- if and (.Openshift) (.Runtime.OpenshiftVersion) }} - name: OPENSHIFT_VERSION