# This is a YAML-formatted file.
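#
# A typical installation using this file might look like the following (illustrative
# only; the chart path and release name are assumptions, adjust them to your checkout):
#   helm install llm-engine ./charts/llm-engine -f values_sample.yaml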
# tag [required] is the LLM Engine docker image tag
tag: 60ac144c55aad971cdd7f152f4f7816ce2fb7d2f
# context is a user-specified deployment tag. Can be used to
# distinguish between deployments (e.g. production vs. staging).
context: production
image:
  # gatewayRepository [required] is the docker repository to pull the LLM Engine gateway image from
  gatewayRepository: public.ecr.aws/b2z8n5q1/model-engine
  # builderRepository [required] is the docker repository to pull the LLM Engine endpoint builder image from
  builderRepository: public.ecr.aws/b2z8n5q1/model-engine
  # cacherRepository [required] is the docker repository to pull the LLM Engine cacher image from
  cacherRepository: public.ecr.aws/b2z8n5q1/model-engine
  # forwarderRepository [required] is the docker repository to pull the LLM Engine forwarder image from
  forwarderRepository: public.ecr.aws/b2z8n5q1/model-engine
  # pullPolicy is the docker image pull policy
  pullPolicy: Always

secrets:
  # kubernetesDatabaseSecretName or cloudDatabaseSecretName [required]
  # is the name of the secret that contains the database credentials
  kubernetesDatabaseSecretName: llm-engine-postgres-credentials
  # Azure Key Vault name to pull secrets from
  keyvaultName: llm-engine-keyvault

db:
  runDbInitScript: false
  runDbMigrationScript: false

# serviceAccount [required] specifies the service account for LLM Engine server deployments (e.g. gateway, cacher, and builder deployments).
serviceAccount:
  annotations:
    # eks.amazonaws.com/role-arn [required] is the ARN of the IAM role that the service account will assume
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/k8s-main-llm-engine
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-2"
  namespaces: []

imageBuilderServiceAccount:
  create: true
  annotations:
    # eks.amazonaws.com/role-arn [required] is the ARN of the IAM role that the image builder service account will assume. Needs to have ECR permissions.
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/k8s-main-llm-engine-image-builder
  # Reads from serviceAccount.namespaces to determine which namespaces to create the image builder service account in

# service specifies the service configuration for the main LLM Engine server. Users should set up their own ingress controller to expose the service.
service:
  type: ClusterIP
  port: 80

# virtualservice specifies the configuration of an Istio VirtualService
virtualservice:
  enabled: true
  annotations: { }
  hostDomains:
    - llm-engine.domain.com
  gateways:
    - default/internal-gateway

hostDomain:
  prefix: http://

# destinationrule specifies the configuration of an Istio DestinationRule
destinationrule:
  enabled: true
  annotations: { }

# replicaCount specifies the number of replica pods for each deployment
replicaCount:
  # gateway is the main LLM Engine server deployment
  gateway: 2
  # cacher is the kubernetes state caching deployment
  cacher: 1
  # builder is the endpoint builder deployment
  builder: 1

balloonConfig:
  # If set to true, only high priority pods can preempt balloons. Otherwise, all pods can preempt balloons.
  reserveHighPriority: true
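
# The balloons below appear to implement the common cluster-overprovisioning pattern:
# low-priority placeholder pods hold capacity on a node class so that endpoint pods can
# preempt them and schedule quickly; a replicaCount of 0 leaves a pool disabled.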
balloons:
  # A low priority pod deployment for A10 GPU nodes
  - acceleratorName: nvidia-ampere-a10
    replicaCount: 0
  # A low priority pod deployment for A100 GPU nodes
  - acceleratorName: nvidia-ampere-a100
    replicaCount: 0
  # A low priority pod deployment for CPU nodes
  - acceleratorName: cpu
    replicaCount: 0
  # A low priority pod deployment for T4 GPU nodes
  - acceleratorName: nvidia-tesla-t4
    replicaCount: 0
  # A low priority pod deployment for H100 GPU nodes
  - acceleratorName: nvidia-hopper-h100
    replicaCount: 0
    gpuCount: 4

# autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
autoscaling:
  horizontal:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetConcurrency: 50
  vertical:
    enabled: false
  prewarming:
    enabled: false
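  # Note (an assumption about the concurrency-based autoscaler, not stated in this file):
  # the replica count is driven toward total_concurrent_requests / targetConcurrency,
  # e.g. ~500 concurrent requests with targetConcurrency: 50 scales toward 10 pods,
  # clamped to [minReplicas, maxReplicas].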

# For async endpoints, the Celery autoscaler scales the number of pods based on the number of requests;
# num_shards is the number of instances of the autoscaler
celery_autoscaler:
  enabled: true
  num_shards: 3

podDisruptionBudget:
  enabled: true
  minAvailable: 1

# resources specify the k8s resources for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
resources:
  requests:
    cpu: 2
# nodeSelector specifies the node selector for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
nodeSelector: { }
# tolerations specifies the tolerations for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
tolerations: [ ]
# affinity specifies the affinity for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
affinity: { }

# aws specifies the AWS configurations (by configMap) for LLM Engine server deployments
aws:
  configMap:
    name: default-config
    create: true
  profileName: default

# serviceTemplate specifies additional flags for model endpoints
serviceTemplate:
  securityContext:
    capabilities:
      drop:
        - all
  mountInfraConfig: true
  # createServiceAccount/serviceAccountName/serviceAccountAnnotations specify whether to create a serviceAccount for
  # inference pods. Assumes the inference pods run in a separate namespace from the LLM Engine control plane.
  createServiceAccount: true
  serviceAccountName: model-engine
  serviceAccountAnnotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/llm-engine
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-2"

# config specifies the `data` field of the service config map
config:
  values:
    infra:
      # cloud_provider [required]; either "aws", "azure", or "gcp"
      cloud_provider: aws
      # k8s_cluster_name [required] is the name of the k8s cluster
      k8s_cluster_name: main_cluster
      # dns_host_domain [required] is the domain name of the k8s cluster
      dns_host_domain: llm-engine.domain.com
      # default_region [required] is the default AWS region for various resources (e.g. ECR)
      default_region: us-east-1
      # ml_account_id [required] is the AWS account ID for various resources (e.g. ECR) if cloud_provider is "aws", and the GCP project ID if cloud_provider is "gcp"
      ml_account_id: "000000000000"
      # docker_repo_prefix [required] is the prefix for AWS ECR repositories, GCP Artifact Registry repositories, or Azure Container Registry repositories
      docker_repo_prefix: "000000000000.dkr.ecr.us-east-1.amazonaws.com"
      # redis_host [required if redis_aws_secret_name not present] is the hostname of the redis cluster you wish to connect to
      redis_host: llm-engine-prod-cache.use1.cache.amazonaws.com
      # redis_aws_secret_name [optional] is the AWS secret that contains the connection info of the Redis cluster.
      # The information provided should be as follows:
      #   scheme: either redis:// or rediss://, will default to redis://
      #   auth_token (optional): an auth token for the Redis cluster
      #   host: the hostname of the Redis cluster
      #   port: the port of the Redis cluster
      #   query_params (optional): additional query parameters for the Redis cluster, will default to ""
      # The url will be built as follows:
      #   {scheme}{host}:{port}/{db_index}{query_params} if auth_token is not provided,
      #   {scheme}:{auth_token}@{host}:{port}/{db_index}{query_params} if auth_token is provided
      # db_index will be filled in by LLM Engine.
      # This secret must be accessible by the default LLM Engine AWS role,
      # e.g. what is set by profile_ml_worker if provided
      # redis_aws_secret_name: sample-prod/redis-credentials
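      # As an illustration (hypothetical values), a secret following the layout above
      # could store:
      #   {"scheme": "rediss://", "auth_token": "token123", "host": "my-redis.use1.cache.amazonaws.com", "port": "6379"}
      # which, per the rules above, would be assembled into
      #   rediss://:token123@my-redis.use1.cache.amazonaws.com:6379/{db_index}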
      # s3_bucket [required] is the S3 bucket you wish to connect to
      s3_bucket: "llm-engine"
      # DB engine configs (this is SQLAlchemy-heavy)
      db_engine_pool_size: 10
      db_engine_max_overflow: 10
      db_engine_echo: false
      db_engine_echo_pool: false
      db_engine_disconnect_strategy: "pessimistic"
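      # With standard SQLAlchemy pool semantics, pool_size + max_overflow bounds this at
      # roughly 10 + 10 = 20 connections per process; the "pessimistic" disconnect
      # strategy typically tests connections (e.g. a pre-ping) before they are used.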
      # prometheus_server_address [optional, required if you want to scale from zero for sync/streaming endpoints]
      # is the address of the Prometheus server to query for endpoint metrics
      prometheus_server_address: "http://prometheus-server.istio-system.svc.cluster.local:80"
    launch:
      # endpoint_namespace [required] is the K8s namespace the endpoints will be created in
      endpoint_namespace: llm-engine
      # cache_redis_aws_url is the full url for the redis cluster you wish to connect to,
      # cache_redis_azure_host is the redis cluster host when using cloud_provider azure, and
      # cache_redis_aws_secret_name is an AWS secret that contains the Redis credentials.
      # It has a field "cache-url" with the full URL of the Redis cluster (including db number).
      # Other fields are ignored; e.g. you can use the secret for multiple purposes.
      # This secret must be accessible by the default LLM Engine AWS role.
      # Exactly one of cache_redis_aws_url, cache_redis_azure_host, or cache_redis_aws_secret_name
      # must be provided; the unused alternatives are shown commented out below.
      cache_redis_aws_url: redis://llm-engine-prod-cache.use1.cache.amazonaws.com:6379/15
      # cache_redis_azure_host: llm-engine-cache.redis.cache.windows.net:6380
      # cache_redis_aws_secret_name: sample-prod/redis-credentials
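      # As an illustration (hypothetical value), such a secret could contain:
      #   {"cache-url": "redis://my-redis.use1.cache.amazonaws.com:6379/15"}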
      # s3_file_llm_fine_tuning_job_repository [required] is the S3 URI for the S3 bucket/key where you wish to save fine-tuned assets
      s3_file_llm_fine_tuning_job_repository: "s3://llm-engine/llm-ft-job-repository"
      # dd_trace_enabled specifies whether to enable Datadog tracing; Datadog must be installed in the cluster
      dd_trace_enabled: false
      istio_enabled: true
      sensitive_log_mode: false
      # Asynchronous endpoints configs (coming soon)
      sqs_profile: default
      # sqs_queue_policy_template [required] is the IAM policy template for the SQS queues of async endpoints.
      sqs_queue_policy_template: >
        {
          "Version": "2012-10-17",
          "Id": "__default_policy_ID",
          "Statement": [
            {
              "Sid": "__owner_statement",
              "Effect": "Allow",
              "Principal": {
                "AWS": "arn:aws:iam::000000000000:root"
              },
              "Action": "sqs:*",
              "Resource": "arn:aws:sqs:us-east-1:000000000000:${queue_name}"
            },
            {
              "Effect": "Allow",
              "Principal": {
                "AWS": "arn:aws:iam::000000000000:role/k8s-main-llm-engine"
              },
              "Action": "sqs:*",
              "Resource": "arn:aws:sqs:us-east-1:000000000000:${queue_name}"
            }
          ]
        }
      # sqs_queue_tag_template is the tag template for the SQS queues of async endpoints
      sqs_queue_tag_template: >
        {
          "Spellbook-Serve-Endpoint-Id": "${endpoint_id}",
          "Spellbook-Serve-Endpoint-Name": "${endpoint_name}",
          "Spellbook-Serve-Endpoint-Created-By": "${endpoint_created_by}"
        }
      billing_queue_arn: "unused"
      model_primitive_host: "unused"
      hf_user_fine_tuned_weights_prefix: "s3://llm-engine/fine_tuned_weights"
      tgi_repository: "text-generation-inference"
      vllm_repository: "vllm"
      lightllm_repository: "lightllm"
      tensorrt_llm_repository: "tensorrt-llm"
      batch_inference_vllm_repository: "llm-engine/batch-infer-vllm"
      user_inference_base_repository: "launch/inference"
      user_inference_pytorch_repository: "launch/inference/pytorch"
      user_inference_tensorflow_repository: "launch/inference/tf"
      docker_image_layer_cache_repository: "launch-docker-build-cache"

# Triton enhanced endpoints (coming soon)
triton:
  image:
    repository: 000000000000.dkr.ecr.us-west-2.amazonaws.com/std-ml-srv
    tag: e83eccbc8959f90ebbe4bda618b61ec6ee2d8394-triton

# imageCache specifies the image cache configuration for faster endpoint auto-scaling (coming soon)
imageCache:
  devices:
    - name: cpu
      nodeSelector:
        cpu-only: "true"
    - name: a10
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-ampere-a10
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: a100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-ampere-a100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: t4
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-tesla-t4
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-1g20gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-1g20gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-3g40gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-3g40gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"

# celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
celeryBrokerType: sqs

datadog:
  enabled: false
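
# recommendedHardware appears to map model size to endpoint hardware: a byGpuMemoryGb
# entry is presumably chosen as the smallest gpu_memory_le (GiB) bound that fits the
# model, e.g. a model needing ~40GB of GPU memory falls in the gpu_memory_le: 48 tier
# (2x A10); byModelName entries pin hardware for specific models.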
recommendedHardware:
  byGpuMemoryGb:
    - gpu_memory_le: 24
      cpus: 10
      gpus: 1
      memory: 24Gi
      storage: 80Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 48
      cpus: 20
      gpus: 2
      memory: 48Gi
      storage: 80Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 96
      cpus: 40
      gpus: 4
      memory: 96Gi
      storage: 96Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 180
      cpus: 20
      gpus: 2
      memory: 160Gi
      storage: 160Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 320
      cpus: 40
      gpus: 4
      memory: 320Gi
      storage: 320Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 640
      cpus: 80
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 640
      cpus: 80
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 2
  byModelName:
    - name: llama-3-8b-instruct-262k
      cpus: 20
      gpus: 2
      memory: 40Gi
      storage: 40Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - name: deepseek-coder-v2
      cpus: 160
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - name: deepseek-coder-v2-instruct
      cpus: 160
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1