-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathdeviceconfig_example.yaml
More file actions
177 lines (151 loc) · 7.45 KB
/
deviceconfig_example.yaml
File metadata and controls
177 lines (151 loc) · 7.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
---
apiVersion: amd.com/v1alpha1
kind: DeviceConfig
metadata:
  # the names for the device plugin, metrics exporter and node labeler pods will be prefixed with this name
  name: gpu-operator
  # it is highly recommended to use the namespace where AMD GPU Operator is running
  namespace: kube-amd-gpu
spec:
  driver:
    # set to true for deploying out-of-tree driver with specified ROCm version
    # set to false to directly use inbox or pre-installed driver on worker nodes
    enable: false
    # set to true to add blacklist for the amdgpu inbox driver kernel module, required for spec.driver.enable=true
    # set to false to remove blacklist for the amdgpu inbox driver kernel module, required for spec.driver.enable=false
    # the reboot of worker node is required to apply the updated blacklist
    blacklist: false
    # Specify the out-of-tree driver version
    # NOTE: Starting from ROCm 7.1 the amdgpu version is using new versioning schema
    # please refer to https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html
    version: "7.0"
    # Specify driver image here
    # DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you
    # e.g. docker.io/username/amdgpu-driver
    image: imageregistry.io/username/repo
    # Specify the credential for your private registry if it requires credential to get pull/push access
    # you can create the docker-registry type secret by running command like:
    # kubectl create secret docker-registry mysecret -n kube-amd-gpu --docker-username=xxx --docker-password=xxx
    # Make sure you created the secret within the namespace that gpu operator controller is running
    #imageRegistrySecret:
    #  name: mysecret
    # Specify the image registry TLS config if you are using insecure registry for managing driver images
    #imageRegistryTLS:
    #  insecure: true
    #  insecureSkipTLSVerify: true
    ## (Optional) configure the driver image build within the cluster
    #imageBuild:
    #  # configure the registry to search for base image for building driver
    #  # e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io
    #  # image builder will use docker.io/ubuntu:22.04 as base image
    #  baseImageRegistry: docker.io
    #  # sourceImageRepo: specify the amdgpu source code image repo for building driver
    #  # the Operator will decide the image tag based on user provided driver version and system OS version
    #  # e.g. if you input docker.io/rocm/amdgpu-driver the image tag will be coreos-<rhel version>-<driver version>
    #  # NOTE: currently only work for OpenShift cluster
    #  sourceImageRepo: docker.io/rocm/amdgpu-driver
    #  baseImageRegistryTLS:
    #    insecure: false  # If true, check for the container image using plain HTTP
    #    insecureSkipTLSVerify: false  # If true, skip any TLS server certificate validation (useful for self-signed certificates)
    # Specify the image signing config for building + signing image within cluster
    #imageSign:
    #  keySecret:
    #    name: mysignkey
    #  certSecret:
    #    name: mysigncert
  devicePlugin:
    # Enable device plugin (set to true to enable, false to disable)
    # Default is false, but typically you want this enabled
    enableDevicePlugin: true
    # Specify the device plugin image
    # default value is rocm/k8s-device-plugin:latest
    devicePluginImage: rocm/k8s-device-plugin:latest
    # Specify the node labeller image
    # default value is rocm/k8s-device-plugin:labeller-latest
    nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
    # Specify to enable/disable the node labeller
    # node labeller is required for adding / removing blacklist config of amdgpu kernel module
    # please set to true if you want to blacklist the inbox driver and use out-of-tree driver
    enableNodeLabeller: true
    # Specify Node Labeller image pull policy
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    nodeLabellerImagePullPolicy: "Always"
  # Specify the DRA driver config (Dynamic Resource Allocation)
  # NOTE(review): nesting restored from flattened source — confirm against the
  # DeviceConfig CRD schema whether draDriver belongs under spec or spec.devicePlugin
  draDriver:
    # To enable/disable the DRA driver, disabled by default
    # Only enable if you need DRA support (requires Kubernetes 1.32+)
    enable: false
    # DRA driver image
    # image: rocm/k8s-dra-driver:latest
    # DRA driver image pull policy
    # imagePullPolicy: "IfNotPresent"
    # DRA driver node selector, if not specified it will reuse spec.selector
    # selector: {}
  # Specify the metrics exporter config
  metricsExporter:
    # To enable/disable the metrics exporter, disabled by default
    enable: true
    # configure a node selector for metrics exporter
    # if not specified metrics exporter will use the node selector from spec.selector by default
    #selector:
    #  feature.node.kubernetes.io/amd-gpu: "true"
    # podAnnotations for metrics exporter
    podAnnotations: {}
    # serviceAnnotations for metrics exporter
    serviceAnnotations: {}
    # kubernetes service type for metrics exporter, clusterIP(default) or NodePort
    serviceType: "NodePort"
    # internal service port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000)
    port: 5000
    # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
    nodePort: 32500
    # exporter image
    image: docker.io/rocm/device-metrics-exporter:v1.4.1
    # image pull policy for metrics exporter
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    imagePullPolicy: "IfNotPresent"
    # metrics config in configmap
    # config:
    #   # configmap name, example config in example/metricsExporter/config.json
    #   name: gpu-config
  # Specify the testrunner config
  testRunner:
    # To enable/disable the testrunner, disabled by default
    enable: true
    # testrunner image
    image: docker.io/rocm/test-runner:v1.4.1
    # image pull policy for the testrunner
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    imagePullPolicy: "IfNotPresent"
    # specify the mount for test logs
    logsLocation:
      # mount path inside test runner container
      mountPath: "/var/log/amd-test-runner"
      # host path to be mounted into test runner container
      hostPath: "/var/log/amd-test-runner"
    # list of secrets that contain connectivity info to cloud providers
    #logsExportSecrets:
    #  - name: azure-secret
    #  - name: aws-secret
  configManager:
    # To enable/disable the config manager, enable to partition
    enable: true
    # image for the device-config-manager container
    image: docker.io/rocm/device-config-manager:v1.4.1
    # image pull policy for config manager set to always to pull image of latest version
    imagePullPolicy: IfNotPresent
    # specify configmap name which stores profile config info
    #config:
    #  name: "config-manager-config"
    # DCM pod deployed either as a standalone pod or through the GPU operator will have
    # a toleration attached to it. User can specify additional tolerations if required
    # key: amd-dcm , value: up , Operator: Equal, effect: NoExecute
    # OPTIONAL
    # toleration field for dcm pod to bypass nodes with specific taints
    #configManagerTolerations:
    #  - key: "key1"
    #    operator: "Equal"
    #    value: "value1"
    #    effect: "NoExecute"
  # Specify the node to be managed by this DeviceConfig Custom Resource
  selector:
    feature.node.kubernetes.io/amd-gpu: "true"