From 3cf940d273affb1024b62a5c196850d23bbac0ca Mon Sep 17 00:00:00 2001
From: aggarwalakshun
Date: Sat, 25 Oct 2025 14:55:10 +0530
Subject: [PATCH] Add Nvidia GPU operator Helm configuration files

---
Review note: file contents were edited after format-patch was run; the
blob ids on the "index" lines below were not regenerated.

 .../gpu-operator/gpu-operator-configmap.yml   | 21 +++++++++
 .../gpu-operator/gpu-operator-release.yml     | 43 +++++++++++++++++++
 .../helm/gpu-operator/gpu-operator-repo.yml   | 10 +++++
 3 files changed, 74 insertions(+)
 create mode 100644 clusters/default/helm/gpu-operator/gpu-operator-configmap.yml
 create mode 100644 clusters/default/helm/gpu-operator/gpu-operator-release.yml
 create mode 100644 clusters/default/helm/gpu-operator/gpu-operator-repo.yml

diff --git a/clusters/default/helm/gpu-operator/gpu-operator-configmap.yml b/clusters/default/helm/gpu-operator/gpu-operator-configmap.yml
new file mode 100644
index 0000000..7d89550
--- /dev/null
+++ b/clusters/default/helm/gpu-operator/gpu-operator-configmap.yml
@@ -0,0 +1,21 @@
+---
+# Time-slicing configuration consumed by the NVIDIA device plugin.
+# It is wired into the ClusterPolicy declaratively through
+# spec.values.devicePlugin.config in gpu-operator-release.yml, so no manual
+# `kubectl patch clusterpolicy/cluster-policy` step is required.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: time-slicing-config
+  namespace: gpu-operator
+data:
+  # "any" is the entry selected as the default device-plugin config.
+  any: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 4
diff --git a/clusters/default/helm/gpu-operator/gpu-operator-release.yml b/clusters/default/helm/gpu-operator/gpu-operator-release.yml
new file mode 100644
index 0000000..a7d924b
--- /dev/null
+++ b/clusters/default/helm/gpu-operator/gpu-operator-release.yml
@@ -0,0 +1,43 @@
+---
+# Flux HelmRelease installing the NVIDIA GPU operator on k3s.
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: gpu-operator
+  # NOTE(review): this object and time-slicing-config both live in
+  # gpu-operator, so the namespace must exist before they are applied;
+  # confirm a Namespace manifest is reconciled first.
+  namespace: gpu-operator
+spec:
+  interval: 24h
+  chart:
+    spec:
+      chart: gpu-operator
+      version: "v25.3.4"
+      sourceRef:
+        kind: HelmRepository
+        name: nvidia
+        namespace: flux-system
+      interval: 24h
+  install:
+    createNamespace: true
+  upgrade:
+    remediation:
+      remediateLastFailure: true
+  values:
+    driver:
+      # Operator-managed driver is disabled; presumably the nodes ship a
+      # host-installed NVIDIA driver - confirm.
+      enabled: false
+    toolkit:
+      env:
+        # Point the container toolkit at the k3s containerd socket/config.
+        - name: CONTAINERD_SOCKET
+          value: /run/k3s/containerd/containerd.sock
+        - name: CONTAINERD_CONFIG
+          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
+    devicePlugin:
+      config:
+        # Use the time-slicing-config ConfigMap and its "any" entry as the
+        # cluster-wide default (replaces a manual ClusterPolicy patch).
+        name: time-slicing-config
+        default: any
diff --git a/clusters/default/helm/gpu-operator/gpu-operator-repo.yml b/clusters/default/helm/gpu-operator/gpu-operator-repo.yml
new file mode 100644
index 0000000..cce718c
--- /dev/null
+++ b/clusters/default/helm/gpu-operator/gpu-operator-repo.yml
@@ -0,0 +1,10 @@
+---
+# Helm repository that serves the NVIDIA gpu-operator chart.
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: nvidia
+  namespace: flux-system
+spec:
+  interval: 24h
+  url: https://helm.ngc.nvidia.com/nvidia