From f7f7225e29365583a9fea91d278e6a849dfa24ec Mon Sep 17 00:00:00 2001
From: aggarwalakshun
Date: Tue, 9 Dec 2025 21:16:58 +0530
Subject: [PATCH] add nvidia-gpu cluster policy

---
Review notes (this section is ignored when the patch is applied):
- The blob id on the "index" line predates the review edits to the manifest.
- NOTE(review): "From:" carries no e-mail address; git am presumably needs
  one, so add it or use git apply - confirm.

 .../helm/gpu-operator/gpu-operator-policy.yml | 304 ++++++++++++++++++
 1 file changed, 304 insertions(+)
 create mode 100644 clusters/default/helm/gpu-operator/gpu-operator-policy.yml

diff --git a/clusters/default/helm/gpu-operator/gpu-operator-policy.yml b/clusters/default/helm/gpu-operator/gpu-operator-policy.yml
new file mode 100644
index 0000000..3050339
--- /dev/null
+++ b/clusters/default/helm/gpu-operator/gpu-operator-policy.yml
@@ -0,0 +1,304 @@
+# NVIDIA GPU Operator ClusterPolicy (nvidia.com/v1) for clusters/default.
+#
+# NOTE(review): this looks like an export of the Helm-rendered object; the
+# server-populated metadata.generation field has been dropped. The Helm
+# ownership labels/annotations are kept as found - confirm that the
+# gpu-operator Helm release does not also manage cluster-policy.
+---
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  annotations:
+    meta.helm.sh/release-name: gpu-operator
+    meta.helm.sh/release-namespace: gpu-operator
+  labels:
+    app.kubernetes.io/component: gpu-operator
+    app.kubernetes.io/instance: gpu-operator
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: gpu-operator
+    app.kubernetes.io/version: v25.3.2
+    helm.sh/chart: gpu-operator-v25.3.2
+    helm.toolkit.fluxcd.io/name: gpu-operator
+    helm.toolkit.fluxcd.io/namespace: gpu-operator
+  name: cluster-policy
+spec:
+  ccManager:
+    defaultMode: "off"
+    enabled: false
+    env: []
+    image: k8s-cc-manager
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v0.1.1
+  cdi:
+    default: false
+    enabled: false
+  daemonsets:
+    labels:
+      app.kubernetes.io/managed-by: gpu-operator
+      helm.sh/chart: gpu-operator-v25.3.2
+    priorityClassName: system-node-critical
+    rollingUpdate:
+      maxUnavailable: "1"
+    tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+    updateStrategy: RollingUpdate
+  dcgm:
+    enabled: false
+    image: dcgm
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: 4.2.3-1-ubuntu22.04
+  dcgmExporter:
+    enabled: true
+    env:
+      - name: DCGM_EXPORTER_LISTEN
+        value: ":9400"
+      - name: DCGM_EXPORTER_KUBERNETES
+        value: "true"
+      - name: DCGM_EXPORTER_COLLECTORS
+        value: /etc/dcgm-exporter/dcp-metrics-included.csv
+    image: dcgm-exporter
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/k8s
+    service:
+      internalTrafficPolicy: Cluster
+    serviceMonitor:
+      additionalLabels: {}
+      enabled: false
+      honorLabels: false
+      interval: 15s
+      relabelings: []
+    version: 4.2.3-4.1.3-ubuntu22.04
+  devicePlugin:
+    # Config source: the time-slicing-config ConfigMap, default entry "any".
+    config:
+      default: any
+      name: time-slicing-config
+    enabled: true
+    env:
+      - name: PASS_DEVICE_SPECS
+        value: "true"
+      - name: FAIL_ON_INIT_ERROR
+        value: "true"
+      - name: DEVICE_LIST_STRATEGY
+        value: envvar
+      - name: DEVICE_ID_STRATEGY
+        value: uuid
+      - name: NVIDIA_VISIBLE_DEVICES
+        value: all
+      - name: NVIDIA_DRIVER_CAPABILITIES
+        value: all
+    image: k8s-device-plugin
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia
+    version: v0.17.3
+  driver:
+    certConfig:
+      name: ""
+    # NOTE(review): the driver container is disabled - presumably the nodes
+    # ship a preinstalled NVIDIA driver; confirm.
+    enabled: false
+    image: driver
+    imagePullPolicy: IfNotPresent
+    kernelModuleConfig:
+      name: ""
+    kernelModuleType: auto
+    licensingConfig:
+      configMapName: ""
+      nlsEnabled: true
+    manager:
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "true"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+        - name: DRAIN_USE_FORCE
+          value: "false"
+        - name: DRAIN_POD_SELECTOR_LABEL
+          value: ""
+        - name: DRAIN_TIMEOUT_SECONDS
+          value: 0s
+        - name: DRAIN_DELETE_EMPTYDIR_DATA
+          value: "false"
+      image: k8s-driver-manager
+      imagePullPolicy: IfNotPresent
+      repository: nvcr.io/nvidia/cloud-native
+      version: v0.8.0
+    rdma:
+      enabled: false
+      useHostMofed: false
+    repoConfig:
+      configMapName: ""
+    repository: nvcr.io/nvidia
+    startupProbe:
+      failureThreshold: 120
+      initialDelaySeconds: 60
+      periodSeconds: 10
+      timeoutSeconds: 60
+    upgradePolicy:
+      autoUpgrade: true
+      drain:
+        deleteEmptyDir: false
+        enable: false
+        force: false
+        timeoutSeconds: 300
+      maxParallelUpgrades: 1
+      maxUnavailable: 25%
+      podDeletion:
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      waitForCompletion:
+        timeoutSeconds: 0
+    useNvidiaDriverCRD: false
+    usePrecompiled: false
+    version: 570.148.08
+    virtualTopology:
+      config: ""
+  gdrcopy:
+    enabled: false
+    image: gdrdrv
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v2.5
+  gfd:
+    enabled: true
+    env:
+      - name: GFD_SLEEP_INTERVAL
+        value: 60s
+      - name: GFD_FAIL_ON_INIT_ERROR
+        value: "true"
+    image: k8s-device-plugin
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia
+    version: v0.17.3
+  hostPaths:
+    driverInstallDir: /run/nvidia/driver
+    rootFS: /
+  kataManager:
+    config:
+      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+      runtimeClasses:
+        - artifacts:
+            pullSecret: ""
+            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
+          name: kata-nvidia-gpu
+          nodeSelector: {}
+        - artifacts:
+            pullSecret: ""
+            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
+          name: kata-nvidia-gpu-snp
+          nodeSelector:
+            nvidia.com/cc.capable: "true"
+    enabled: false
+    image: k8s-kata-manager
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v0.2.3
+  mig:
+    strategy: single
+  migManager:
+    config:
+      default: all-disabled
+      name: default-mig-parted-config
+    enabled: true
+    env:
+      - name: WITH_REBOOT
+        value: "false"
+    gpuClientsConfig:
+      name: ""
+    image: k8s-mig-manager
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v0.12.2-ubuntu20.04
+  nodeStatusExporter:
+    enabled: false
+    image: gpu-operator-validator
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v25.3.2
+  operator:
+    # Fallback container runtime; set to containerd so it agrees with the k3s
+    # containerd socket/config paths configured under toolkit.env.
+    defaultRuntime: containerd
+    initContainer:
+      image: cuda
+      imagePullPolicy: IfNotPresent
+      repository: nvcr.io/nvidia
+      version: 12.8.1-base-ubi9
+    runtimeClass: nvidia
+  psa:
+    enabled: false
+  sandboxDevicePlugin:
+    enabled: true
+    image: kubevirt-gpu-device-plugin
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia
+    version: v1.3.1
+  sandboxWorkloads:
+    defaultWorkload: container
+    enabled: false
+  toolkit:
+    enabled: true
+    # containerd socket and config paths of a k3s install.
+    env:
+      - name: CONTAINERD_SOCKET
+        value: /run/k3s/containerd/containerd.sock
+      - name: CONTAINERD_CONFIG
+        value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
+    image: container-toolkit
+    imagePullPolicy: IfNotPresent
+    installDir: /usr/local/nvidia
+    repository: nvcr.io/nvidia/k8s
+    version: v1.17.8-ubuntu20.04
+  validator:
+    image: gpu-operator-validator
+    imagePullPolicy: IfNotPresent
+    plugin:
+      env:
+        - name: WITH_WORKLOAD
+          value: "false"
+    repository: nvcr.io/nvidia/cloud-native
+    version: v25.3.2
+  vfioManager:
+    driverManager:
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "false"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+      image: k8s-driver-manager
+      imagePullPolicy: IfNotPresent
+      repository: nvcr.io/nvidia/cloud-native
+      version: v0.8.0
+    enabled: true
+    image: cuda
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia
+    version: 12.8.1-base-ubi9
+  vgpuDeviceManager:
+    config:
+      default: default
+      name: ""
+    enabled: true
+    image: vgpu-device-manager
+    imagePullPolicy: IfNotPresent
+    repository: nvcr.io/nvidia/cloud-native
+    version: v0.3.0
+  vgpuManager:
+    driverManager:
+      env:
+        - name: ENABLE_GPU_POD_EVICTION
+          value: "false"
+        - name: ENABLE_AUTO_DRAIN
+          value: "false"
+      image: k8s-driver-manager
+      imagePullPolicy: IfNotPresent
+      repository: nvcr.io/nvidia/cloud-native
+      version: v0.8.0
+    enabled: false
+    image: vgpu-manager
+    imagePullPolicy: IfNotPresent