add nvidia-gpu cluster policy

This commit is contained in:
2025-12-09 21:16:58 +05:30
parent 81ca4b95bd
commit f7f7225e29

View File

@@ -0,0 +1,292 @@
# ClusterPolicy custom resource (nvidia.com/v1) named `cluster-policy`.
# Indentation restored: annotations, labels, generation and name all belong
# under `metadata`.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  # NOTE(review): the Helm release annotations and the Helm/Flux labels below
  # suggest this object is owned by a `gpu-operator` Helm release - confirm
  # that applying this file directly will not conflict with that release.
  annotations:
    meta.helm.sh/release-name: gpu-operator
    meta.helm.sh/release-namespace: gpu-operator
  # NOTE(review): `generation` is normally populated by the API server;
  # presumably this manifest was exported from a live cluster - confirm
  # whether it should stay in version control.
  generation: 2
  labels:
    app.kubernetes.io/component: gpu-operator
    app.kubernetes.io/instance: gpu-operator
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: gpu-operator
    app.kubernetes.io/version: v25.3.2
    helm.sh/chart: gpu-operator-v25.3.2
    helm.toolkit.fluxcd.io/name: gpu-operator
    helm.toolkit.fluxcd.io/namespace: gpu-operator
  name: cluster-policy
# Desired configuration of the GPU operator components. Each key directly
# under `spec` configures one component; nesting restored so that repeated
# keys (enabled, image, repository, version, ...) attach to their own section.
spec:
  # Disabled in this policy.
  ccManager:
    defaultMode: "off"
    enabled: false
    env: []
    image: k8s-cc-manager
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v0.1.1

  cdi:
    default: false
    enabled: false

  # Settings shared by the daemonsets the operator creates.
  daemonsets:
    labels:
      app.kubernetes.io/managed-by: gpu-operator
      helm.sh/chart: gpu-operator-v25.3.2
    priorityClassName: system-node-critical
    rollingUpdate:
      maxUnavailable: "1"
    tolerations:
      - effect: NoSchedule
        key: nvidia.com/gpu
        operator: Exists
    updateStrategy: RollingUpdate

  # Disabled in this policy; dcgmExporter below is enabled.
  dcgm:
    enabled: false
    image: dcgm
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: 4.2.3-1-ubuntu22.04

  dcgmExporter:
    enabled: true
    env:
      - name: DCGM_EXPORTER_LISTEN
        # Quoted: a plain scalar with a leading colon is easy to misread.
        value: ":9400"
      - name: DCGM_EXPORTER_KUBERNETES
        value: "true"
      - name: DCGM_EXPORTER_COLLECTORS
        value: /etc/dcgm-exporter/dcp-metrics-included.csv
    image: dcgm-exporter
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/k8s
    service:
      internalTrafficPolicy: Cluster
    serviceMonitor:
      additionalLabels: {}
      enabled: false
      honorLabels: false
      interval: 15s
      relabelings: []
    version: 4.2.3-4.1.3-ubuntu22.04

  devicePlugin:
    # NOTE(review): presumably `time-slicing-config` is a ConfigMap and `any`
    # is a key inside it - confirm both exist in the operator namespace.
    config:
      default: any
      name: time-slicing-config
    enabled: true
    env:
      - name: PASS_DEVICE_SPECS
        value: "true"
      - name: FAIL_ON_INIT_ERROR
        value: "true"
      - name: DEVICE_LIST_STRATEGY
        value: envvar
      - name: DEVICE_ID_STRATEGY
        value: uuid
      - name: NVIDIA_VISIBLE_DEVICES
        value: all
      - name: NVIDIA_DRIVER_CAPABILITIES
        value: all
    image: k8s-device-plugin
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    version: v0.17.3

  # Disabled in this policy; the remaining driver settings are still recorded.
  driver:
    certConfig:
      name: ""
    enabled: false
    image: driver
    imagePullPolicy: IfNotPresent
    kernelModuleConfig:
      name: ""
    kernelModuleType: auto
    licensingConfig:
      configMapName: ""
      nlsEnabled: true
    manager:
      env:
        - name: ENABLE_GPU_POD_EVICTION
          value: "true"
        - name: ENABLE_AUTO_DRAIN
          value: "false"
        - name: DRAIN_USE_FORCE
          value: "false"
        - name: DRAIN_POD_SELECTOR_LABEL
          value: ""
        - name: DRAIN_TIMEOUT_SECONDS
          value: 0s
        - name: DRAIN_DELETE_EMPTYDIR_DATA
          value: "false"
      image: k8s-driver-manager
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia/cloud-native
      version: v0.8.0
    rdma:
      enabled: false
      useHostMofed: false
    repoConfig:
      configMapName: ""
    repository: nvcr.io/nvidia
    startupProbe:
      failureThreshold: 120
      initialDelaySeconds: 60
      periodSeconds: 10
      timeoutSeconds: 60
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      maxUnavailable: 25%
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    useNvidiaDriverCRD: false
    usePrecompiled: false
    # Quoted so the dotted version can never be read as a number.
    version: "570.148.08"
    virtualTopology:
      config: ""

  # Disabled in this policy.
  gdrcopy:
    enabled: false
    image: gdrdrv
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v2.5

  # Uses the same image and version as devicePlugin above.
  gfd:
    enabled: true
    env:
      - name: GFD_SLEEP_INTERVAL
        value: 60s
      - name: GFD_FAIL_ON_INIT_ERROR
        value: "true"
    image: k8s-device-plugin
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    version: v0.17.3

  hostPaths:
    driverInstallDir: /run/nvidia/driver
    rootFS: /

  # Disabled in this policy; two runtime classes are still declared.
  kataManager:
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
      runtimeClasses:
        - artifacts:
            pullSecret: ""
            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.54.03
          name: kata-nvidia-gpu
          nodeSelector: {}
        - artifacts:
            pullSecret: ""
            url: nvcr.io/nvidia/cloud-native/kata-gpu-artifacts:ubuntu22.04-535.86.10-snp
          name: kata-nvidia-gpu-snp
          nodeSelector:
            nvidia.com/cc.capable: "true"
    enabled: false
    image: k8s-kata-manager
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v0.2.3

  mig:
    strategy: single

  migManager:
    config:
      default: all-disabled
      name: default-mig-parted-config
    enabled: true
    env:
      - name: WITH_REBOOT
        value: "false"
    gpuClientsConfig:
      name: ""
    image: k8s-mig-manager
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v0.12.2-ubuntu20.04

  # Disabled in this policy; same image and version as validator below.
  nodeStatusExporter:
    enabled: false
    image: gpu-operator-validator
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v25.3.2

  operator:
    # NOTE(review): defaultRuntime is `docker`, while toolkit.env below points
    # at containerd socket/config paths under k3s directories - confirm this
    # value is intended for these nodes.
    defaultRuntime: docker
    initContainer:
      image: cuda
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia
      version: 12.8.1-base-ubi9
    runtimeClass: nvidia

  psa:
    enabled: false

  # NOTE(review): sandboxDevicePlugin, vfioManager and vgpuDeviceManager are
  # all `enabled: true` while sandboxWorkloads.enabled is false - confirm that
  # combination is intended.
  sandboxDevicePlugin:
    enabled: true
    image: kubevirt-gpu-device-plugin
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    version: v1.3.1

  sandboxWorkloads:
    defaultWorkload: container
    enabled: false

  toolkit:
    enabled: true
    # Both variables point at containerd paths under k3s directories.
    env:
      - name: CONTAINERD_SOCKET
        value: /run/k3s/containerd/containerd.sock
      - name: CONTAINERD_CONFIG
        value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
    image: container-toolkit
    imagePullPolicy: IfNotPresent
    installDir: /usr/local/nvidia
    repository: nvcr.io/nvidia/k8s
    version: v1.17.8-ubuntu20.04

  validator:
    image: gpu-operator-validator
    imagePullPolicy: IfNotPresent
    plugin:
      env:
        - name: WITH_WORKLOAD
          value: "false"
    repository: nvcr.io/nvidia/cloud-native
    version: v25.3.2

  vfioManager:
    driverManager:
      env:
        - name: ENABLE_GPU_POD_EVICTION
          value: "false"
        - name: ENABLE_AUTO_DRAIN
          value: "false"
      image: k8s-driver-manager
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia/cloud-native
      version: v0.8.0
    enabled: true
    image: cuda
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia
    version: 12.8.1-base-ubi9

  vgpuDeviceManager:
    config:
      default: default
      name: ""
    enabled: true
    image: vgpu-device-manager
    imagePullPolicy: IfNotPresent
    repository: nvcr.io/nvidia/cloud-native
    version: v0.3.0

  # Disabled in this policy; no repository or version is set for the
  # vgpu-manager image itself, only for its driverManager.
  vgpuManager:
    driverManager:
      env:
        - name: ENABLE_GPU_POD_EVICTION
          value: "false"
        - name: ENABLE_AUTO_DRAIN
          value: "false"
      image: k8s-driver-manager
      imagePullPolicy: IfNotPresent
      repository: nvcr.io/nvidia/cloud-native
      version: v0.8.0
    enabled: false
    image: vgpu-manager
    imagePullPolicy: IfNotPresent