-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathdeviceconfig_example.yaml
More file actions
177 lines (151 loc) · 7.45 KB
/
deviceconfig_example.yaml
File metadata and controls
177 lines (151 loc) · 7.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
---
apiVersion: amd.com/v1alpha1
kind: DeviceConfig
metadata:
  # the names for the device plugin, metrics exporter and node labeler pods will be prefixed with this name
  name: gpu-operator
  # it is highly recommended to use the namespace where AMD GPU Operator is running
  namespace: kube-amd-gpu
spec:
  driver:
    # set to true for deploying out-of-tree driver with specified ROCm version
    # set to false to directly use inbox or pre-installed driver on worker nodes
    enable: false
    # set to true to add blacklist for the amdgpu inbox driver kernel module, required for spec.driver.enable=true
    # set to false to remove blacklist for the amdgpu inbox driver kernel module, required for spec.driver.enable=false
    # the reboot of worker node is required to apply the updated blacklist
    blacklist: false
    # Specify the out-of-tree driver version
    # NOTE: Starting from ROCm 7.1 the amdgpu version is using new versioning schema
    # please refer to https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/user-kernel-space-compat-matrix.html
    version: "7.0"
    # Specify driver image here
    # DO NOT include the image tag as AMD GPU Operator will automatically manage the image tag for you
    # e.g. docker.io/username/amdgpu-driver
    image: imageregistry.io/username/repo
    # Specify the credential for your private registry if it requires credential to get pull/push access
    # you can create the docker-registry type secret by running command like:
    # kubectl create secret docker-registry mysecret -n kube-amd-gpu --docker-username=xxx --docker-password=xxx
    # Make sure you created the secret within the namespace that gpu operator controller is running
    #imageRegistrySecret:
    #  name: mysecret
    # Specify the image registry TLS config if you are using insecure registry for managing driver images
    #imageRegistryTLS:
    #  insecure: true
    #  insecureSkipTLSVerify: true
    ## (Optional) configure the driver image build within the cluster
    #imageBuild:
    #  # configure the registry to search for base image for building driver
    #  # e.g. if you are using worker node with ubuntu 22.04 and baseImageRegistry is docker.io
    #  # image builder will use docker.io/ubuntu:22.04 as base image
    #  baseImageRegistry: docker.io
    #  # sourceImageRepo: specify the amdgpu source code image repo for building driver
    #  # the Operator will decide the image tag based on user provided driver version and system OS version
    #  # e.g. if you input docker.io/rocm/amdgpu-driver the image tag will be coreos-<rhel version>-<driver version>
    #  # NOTE: currently only work for OpenShift cluster
    #  sourceImageRepo: docker.io/rocm/amdgpu-driver
    #  baseImageRegistryTLS:
    #    insecure: false  # If true, check for the container image using plain HTTP
    #    insecureSkipTLSVerify: false  # If true, skip any TLS server certificate validation (useful for self-signed certificates)
    # Specify the image signing config for building + signing image within cluster
    #imageSign:
    #  keySecret:
    #    name: mysignkey
    #  certSecret:
    #    name: mysigncert
  devicePlugin:
    # Enable device plugin (set to true to enable, false to disable)
    # Default is false, but typically you want this enabled
    enableDevicePlugin: true
    # Specify the device plugin image
    # default value is rocm/k8s-device-plugin:latest
    devicePluginImage: rocm/k8s-device-plugin:latest
    # Specify the node labeller image
    # default value is rocm/k8s-device-plugin:labeller-latest
    nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
    # Specify to enable/disable the node labeller
    # node labeller is required for adding / removing blacklist config of amdgpu kernel module
    # please set to true if you want to blacklist the inbox driver and use out-of-tree driver
    enableNodeLabeller: true
    # Specify Node Labeller image pull policy
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    nodeLabellerImagePullPolicy: "Always"
  # Specify the DRA driver config (Dynamic Resource Allocation)
  # NOTE(review): nesting restored from flattened source — confirm against the
  # DeviceConfig CRD schema whether draDriver belongs under spec or spec.devicePlugin
  draDriver:
    # To enable/disable the DRA driver, disabled by default
    # Only enable if you need DRA support (requires Kubernetes 1.32+)
    enable: false
    # DRA driver image
    # image: rocm/k8s-dra-driver:latest
    # DRA driver image pull policy
    # imagePullPolicy: "IfNotPresent"
    # DRA driver node selector, if not specified it will reuse spec.selector
    # selector: {}
  # Specify the metrics exporter config
  metricsExporter:
    # To enable/disable the metrics exporter, disabled by default
    enable: true
    # configure a node selector for metrics exporter
    # if not specified metrics exporter will use the node selector from spec.selector by default
    #selector:
    #  feature.node.kubernetes.io/amd-gpu: "true"
    # podAnnotations for metrics exporter
    podAnnotations: {}
    # serviceAnnotations for metrics exporter
    serviceAnnotations: {}
    # kubernetes service type for metrics exporter, clusterIP(default) or NodePort
    serviceType: "NodePort"
    # internal service port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000)
    port: 5000
    # Node port for metrics exporter service, metrics endpoint $node-ip:$nodePort
    nodePort: 32500
    # exporter image
    image: docker.io/rocm/device-metrics-exporter:v1.4.1
    # image pull policy for metrics exporter
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    imagePullPolicy: "IfNotPresent"
    # metrics config in configmap
    # config:
    #   # configmap name, example config in example/metricsExporter/config.json
    #   name: gpu-config
  # Specify the testrunner config
  testRunner:
    # To enable/disable the testrunner, disabled by default
    enable: true
    # testrunner image
    image: docker.io/rocm/test-runner:v1.4.1
    # image pull policy for the testrunner
    # default value is IfNotPresent for valid tags, Always for no tag or "latest" tag
    imagePullPolicy: "IfNotPresent"
    # specify the mount for test logs
    logsLocation:
      # mount path inside test runner container
      mountPath: "/var/log/amd-test-runner"
      # host path to be mounted into test runner container
      hostPath: "/var/log/amd-test-runner"
    # list of secrets that contain connectivity info to cloud providers
    #logsExportSecrets:
    #  - name: azure-secret
    #  - name: aws-secret
  configManager:
    # To enable/disable the config manager, enable to partition
    enable: true
    # image for the device-config-manager container
    image: docker.io/rocm/device-config-manager:v1.4.1
    # image pull policy for config manager set to always to pull image of latest version
    imagePullPolicy: IfNotPresent
    # specify configmap name which stores profile config info
    #config:
    #  name: "config-manager-config"
    # DCM pod deployed either as a standalone pod or through the GPU operator will have
    # a toleration attached to it. User can specify additional tolerations if required
    # key: amd-dcm , value: up , Operator: Equal, effect: NoExecute
    # OPTIONAL
    # toleration field for dcm pod to bypass nodes with specific taints
    #configManagerTolerations:
    #  - key: "key1"
    #    operator: "Equal"
    #    value: "value1"
    #    effect: "NoExecute"
  # Specify the node to be managed by this DeviceConfig Custom Resource
  selector:
    feature.node.kubernetes.io/amd-gpu: "true"