diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 07e424761..2ca4faa36 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -24,11 +24,8 @@ import ( kata_v1alpha1 "github.com/NVIDIA/k8s-kata-manager/api/v1alpha1/config" upgrade_v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "golang.org/x/mod/semver" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/NVIDIA/gpu-operator/internal/consts" ) // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! @@ -482,12 +479,20 @@ type DriverSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. // UseOpenKernelModules indicates if the open GPU kernel modules should be used // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"` + // KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + // Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + // type is chosen based on the GPU devices on the host and the driver branch used + // +kubebuilder:validation:Enum=auto;open;proprietary + // +kubebuilder:default=auto + KernelModuleType string `json:"kernelModuleType,omitempty"` + // Enabled indicates if deployment of NVIDIA Driver through operator is enabled // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator" @@ -1856,11 +1861,7 @@ func (d *DriverSpec) UsePrecompiledDrivers() bool { // OpenKernelModulesEnabled returns true if driver install is enabled using open GPU kernel modules func (d *DriverSpec) OpenKernelModulesEnabled() bool { - if d.UseOpenKernelModules == nil { - // default is false if not specified by user - return false - } - return *d.UseOpenKernelModules + return d.KernelModuleType == "open" } // IsEnabled returns true if device-plugin is enabled(default) through gpu-operator @@ -2000,28 +2001,6 @@ func (gds *GPUDirectStorageSpec) IsEnabled() bool { return *gds.Enabled } -// IsOpenKernelModulesRequired returns true if NVIDIA OpenRM drivers required in this configuration -func (gds *GPUDirectStorageSpec) IsOpenKernelModulesRequired() bool { - // Add constraints here which require OpenRM drivers - if !gds.IsEnabled() { - return false - } - - // If image digest is provided instead of the version, assume that OpenRM driver is required - if strings.HasPrefix(gds.Version, "sha256") { - return true - } - - gdsVersion := gds.Version - if !strings.HasPrefix(gdsVersion, "v") { - gdsVersion = fmt.Sprintf("v%s", gdsVersion) - } - if semver.Compare(gdsVersion, consts.MinimumGDSVersionForOpenRM) >= 0 { - return true - } - return false -} - // IsEnabled returns true if GDRCopy is enabled through gpu-operator func (gdrcopy *GDRCopySpec) IsEnabled() bool { if gdrcopy.Enabled == nil { diff --git a/api/nvidia/v1alpha1/nvidiadriver_types.go b/api/nvidia/v1alpha1/nvidiadriver_types.go index 86bae0b48..f45d69242 100644 --- a/api/nvidia/v1alpha1/nvidiadriver_types.go +++ b/api/nvidia/v1alpha1/nvidiadriver_types.go @@ -53,12 +53,20 @@ type NVIDIADriverSpec struct { // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="usePrecompiled is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting." UsePrecompiled *bool `json:"usePrecompiled,omitempty"` + // Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. // UseOpenKernelModules indicates if the open GPU kernel modules should be used // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"` + // KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + // Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + // type is chosen based on the GPU devices on the host and the driver branch used + // +kubebuilder:validation:Enum=auto;open;proprietary + // +kubebuilder:default=auto + KernelModuleType string `json:"kernelModuleType,omitempty"` + // NVIDIA Driver container startup probe settings StartupProbe *ContainerProbeSpec `json:"startupProbe,omitempty"` @@ -642,10 +650,7 @@ func (d *NVIDIADriverSpec) IsGDRCopyEnabled() bool { // IsOpenKernelModulesEnabled returns true if NVIDIA OpenRM drivers are enabled func (d *NVIDIADriverSpec) IsOpenKernelModulesEnabled() bool { - if d.UseOpenKernelModules == nil || !*d.UseOpenKernelModules { - return false - } - return true + return d.KernelModuleType == "open" } // IsOpenKernelModulesRequired returns true if NVIDIA OpenRM drivers required in this configuration diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 552ef3a51..5e699030e 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -41,7 +41,6 @@ metadata: "driver": { "enabled": true, "useNvidiaDriverCRD": false, - "useOpenKernelModules": false, "upgradePolicy": { "autoUpgrade": true, "drain": { @@ -50,6 +49,7 @@ metadata: "force": false, "timeoutSeconds": 300 }, + "kernelModuleType": "auto", "maxParallelUpgrades": 1, "maxUnavailable": "25%", "podDeletion": { diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/bundle/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/manifests/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/bundle/manifests/nvidia.com_nvidiadrivers.yaml +++ b/bundle/manifests/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/config/crd/bases/nvidia.com_nvidiadrivers.yaml b/config/crd/bases/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/config/crd/bases/nvidia.com_nvidiadrivers.yaml +++ b/config/crd/bases/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 1b1801391..02c4876bd 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -165,6 +165,8 @@ const ( DefaultCCModeEnvName = "DEFAULT_CC_MODE" // OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED" + // KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type + KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE" // MPSRootEnvName is the name of the envvar for configuring the MPS root MPSRootEnvName = "MPS_ROOT" // DefaultMPSRoot is the default MPS root path on the host @@ -2664,9 +2666,6 @@ func transformGDSContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if config.Driver.UsePrecompiledDrivers() { return fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers") } - if config.GPUDirectStorage.IsOpenKernelModulesRequired() && !config.Driver.OpenKernelModulesEnabled() { - return fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'driver.useOpenKernelModules=true' in ClusterPolicy to enable OpenRM mode", config.GPUDirectStorage.Version) - } gdsContainer := &obj.Spec.Template.Spec.Containers[i] @@ -3166,8 +3165,13 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy setContainerEnv(driverContainer, env.Name, env.Value) } } - if config.Driver.OpenKernelModulesEnabled() { - setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true") + + if len(config.Driver.KernelModuleType) > 0 { + setContainerEnv(driverContainer, KernelModuleTypeEnvName, config.Driver.KernelModuleType) + // we set the "OPEN_KERNEL_MODULES_ENABLED" envar for backwards compatibility with older driver containers + if config.Driver.OpenKernelModulesEnabled() { + setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true") + } } // set container probe timeouts diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index ae17a9f3d..ad50e9b6d 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -952,74 +952,6 @@ func TestSandboxDevicePluginAssets(t *testing.T) { } } -func TestIsOpenKernelModulesRequired(t *testing.T) { - enable := true - disable := false - testCases := []struct { - description string - gds *gpuv1.GPUDirectStorageSpec - output bool - }{ - { - "gds-disabled", - &gpuv1.GPUDirectStorageSpec{Enabled: &disable, Version: "v2.14.5"}, - false, - }, - { - "digest", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "sha256:8d1ec78f2b1ddb7f0c47453d0427231190747bda411733a7dd0c8f5196f09e9c"}, - true, - }, - { - "lower", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.14.5"}, - false, - }, - { - "equal", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.17.5"}, - true, - }, - { - "greater", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.17.6"}, - true, - }, - { - "major-bump", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v3.1.0"}, - true, - }, - { - "non-semver", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.14.5"}, - false, - }, - { - "non-semver-greater", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.17.6"}, - true, - }, - { - "lower-beta", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.14.6-beta"}, - false, - }, - { - "greater-beta", - &gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.17.6-beta"}, - true, - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - isOpenRMRequired := tc.gds.IsOpenKernelModulesRequired() - require.Equal(t, tc.output, isOpenRMRequired, "Incorrect status from IsOpenKernelModulesRequired() for GDS driver") - }) - } -} - // getDCGMExporterTestInput return a ClusterPolicy instance for a particular // dcgm-exporter test case. func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy { diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 54e4a652b..02ea9cd28 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -650,6 +650,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string licensingConfig: description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' @@ -978,8 +989,9 @@ spec: NVIDIA Driver is managed by the NVIDIADriver CRD type type: boolean useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA diff --git a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml index c49059a38..b2fca2b39 100644 --- a/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml @@ -211,6 +211,17 @@ spec: name: type: string type: object + kernelModuleType: + default: auto + description: |- + KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver. + Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module + type is chosen based on the GPU devices on the host and the driver branch used + enum: + - auto + - open + - proprietary + type: string labels: additionalProperties: type: string @@ -684,8 +695,9 @@ spec: type: object type: array useOpenKernelModules: - description: UseOpenKernelModules indicates if the open GPU kernel - modules should be used + description: |- + Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead. + UseOpenKernelModules indicates if the open GPU kernel modules should be used type: boolean usePrecompiled: description: UsePrecompiled indicates if deployment of NVIDIA Driver diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index af9e87c38..0a753c4a8 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -143,7 +143,7 @@ spec: driver: enabled: {{ .Values.driver.enabled }} useNvidiaDriverCRD: {{ .Values.driver.nvidiaDriverCRD.enabled }} - useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} {{- if .Values.driver.repository }} repository: {{ .Values.driver.repository }} diff --git a/deployments/gpu-operator/templates/nvidiadriver.yaml b/deployments/gpu-operator/templates/nvidiadriver.yaml index 31660c025..cbe567135 100644 --- a/deployments/gpu-operator/templates/nvidiadriver.yaml +++ b/deployments/gpu-operator/templates/nvidiadriver.yaml @@ -7,7 +7,7 @@ spec: repository: {{ .Values.driver.repository }} image: {{ .Values.driver.image }} version: {{ .Values.driver.version }} - useOpenKernelModules: {{ .Values.driver.useOpenKernelModules }} + kernelModuleType: {{ .Values.driver.kernelModuleType }} usePrecompiled: {{ .Values.driver.usePrecompiled }} driverType: {{ .Values.driver.nvidiaDriverCRD.driverType | default "gpu" }} {{- if .Values.daemonsets.annotations }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 5e404f081..e0daee03c 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -137,7 +137,11 @@ driver: deployDefaultCR: true driverType: gpu nodeSelector: {} - useOpenKernelModules: false + kernelModuleType: "auto" + + # NOTE: useOpenKernelModules has been deprecated and made no-op. Please use kernelModuleType instead. + # useOpenKernelModules: false + # use pre-compiled packages for NVIDIA driver installation. # only supported for as a tech-preview feature on ubuntu22.04 kernels. usePrecompiled: false diff --git a/internal/state/driver_test.go b/internal/state/driver_test.go index a591fa9d3..3f9287bee 100644 --- a/internal/state/driver_test.go +++ b/internal/state/driver_test.go @@ -221,8 +221,8 @@ func TestDriverSpec(t *testing.T) { Effect: "NoSchedule", }, }, - PriorityClassName: "custom-priority-class-name", - UseOpenKernelModules: utils.BoolPtr(true), + PriorityClassName: "custom-priority-class-name", + KernelModuleType: "open", } driverSpec.Labels = sanitizeDriverLabels(driverSpec.Labels) diff --git a/internal/state/testdata/golden/driver-full-spec.yaml b/internal/state/testdata/golden/driver-full-spec.yaml index 60065333e..87c6803b8 100644 --- a/internal/state/testdata/golden/driver-full-spec.yaml +++ b/internal/state/testdata/golden/driver-full-spec.yaml @@ -146,6 +146,8 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: KERNEL_MODULE_TYPE + value: open - name: OPEN_KERNEL_MODULES_ENABLED value: "true" - name: FOO diff --git a/manifests/state-driver/0500_daemonset.yaml b/manifests/state-driver/0500_daemonset.yaml index 26dfecf15..643d8a57f 100644 --- a/manifests/state-driver/0500_daemonset.yaml +++ b/manifests/state-driver/0500_daemonset.yaml @@ -213,9 +213,14 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP - {{- if deref .Driver.Spec.UseOpenKernelModules }} + {{- if .Driver.Spec.KernelModuleType }} + - name: KERNEL_MODULE_TYPE + value: {{ .Driver.Spec.KernelModuleType }} + # we set this env var for backwards compatibility with older driver versions + {{- if eq .Driver.Spec.KernelModuleType "open"}} - name: OPEN_KERNEL_MODULES_ENABLED value: "true" + {{- end }} {{- end }} {{- if and (.Openshift) (.Runtime.OpenshiftVersion) }} - name: OPENSHIFT_VERSION