# This is a YAML-formatted file.
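#
# A typical installation using this file might look like the following (illustrative
# only; the chart path and release name are assumptions, adjust them to your checkout):
#   helm install llm-engine ./charts/llm-engine -f values_sample.yaml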
# tag [required] is the LLM Engine docker image tag
tag: 60ac144c55aad971cdd7f152f4f7816ce2fb7d2f
# context is a user-specified deployment tag. Can be used to
# distinguish between deployments (e.g. production vs. staging).
context: production
image:
  # gatewayRepository [required] is the docker repository to pull the LLM Engine gateway image from
  gatewayRepository: public.ecr.aws/b2z8n5q1/model-engine
  # builderRepository [required] is the docker repository to pull the LLM Engine endpoint builder image from
  builderRepository: public.ecr.aws/b2z8n5q1/model-engine
  # cacherRepository [required] is the docker repository to pull the LLM Engine cacher image from
  cacherRepository: public.ecr.aws/b2z8n5q1/model-engine
  # forwarderRepository [required] is the docker repository to pull the LLM Engine forwarder image from
  forwarderRepository: public.ecr.aws/b2z8n5q1/model-engine
  # pullPolicy is the docker image pull policy
  pullPolicy: Always

secrets:
  # kubernetesDatabaseSecretName or cloudDatabaseSecretName [required]
  # is the name of the secret that contains the database credentials
  kubernetesDatabaseSecretName: llm-engine-postgres-credentials
  # Azure Key Vault name to pull secrets from
  keyvaultName: llm-engine-keyvault

db:
  runDbInitScript: false
  runDbMigrationScript: false

# serviceAccount [required] specifies the service account for LLM Engine server deployments (e.g. gateway, cacher, and builder deployments).
serviceAccount:
  annotations:
    # eks.amazonaws.com/role-arn [required] is the ARN of the IAM role that the service account will assume
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/k8s-main-llm-engine
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-2"
  namespaces: []

imageBuilderServiceAccount:
  create: true
  annotations:
    # eks.amazonaws.com/role-arn [required] is the ARN of the IAM role that the image builder service account will assume. Needs to have ECR permissions.
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/k8s-main-llm-engine-image-builder
  # Reads from serviceAccount.namespaces to determine which namespaces to create the image builder service account in

# service specifies the service configuration for the main LLM Engine server. Users should set up their own ingress controller to expose the service.
service:
  type: ClusterIP
  port: 80

# virtualservice specifies the configuration of an Istio VirtualService
virtualservice:
  enabled: true
  annotations: { }
  hostDomains:
    - llm-engine.domain.com
  gateways:
    - default/internal-gateway

hostDomain:
  prefix: http://

# destinationrule specifies the configuration of an Istio DestinationRule
destinationrule:
  enabled: true
  annotations: { }

# replicaCount specifies the number of replica pods for each deployment
replicaCount:
  # gateway is the main LLM Engine server deployment
  gateway: 2
  # cacher is the kubernetes state caching deployment
  cacher: 1
  # builder is the endpoint builder deployment
  builder: 1

balloonConfig:
  # If set to true, only high priority pods can preempt balloons. Otherwise, all pods can preempt balloons.
  reserveHighPriority: true
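
# The balloons below appear to implement the common cluster-overprovisioning pattern:
# low-priority placeholder pods hold capacity on a node class so that endpoint pods can
# preempt them and schedule quickly; a replicaCount of 0 leaves a pool disabled.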
balloons:
  # A low priority pod deployment for A10 GPU nodes
  - acceleratorName: nvidia-ampere-a10
    replicaCount: 0
  # A low priority pod deployment for A100 GPU nodes
  - acceleratorName: nvidia-ampere-a100
    replicaCount: 0
  # A low priority pod deployment for CPU nodes
  - acceleratorName: cpu
    replicaCount: 0
  # A low priority pod deployment for T4 GPU nodes
  - acceleratorName: nvidia-tesla-t4
    replicaCount: 0
  # A low priority pod deployment for H100 GPU nodes
  - acceleratorName: nvidia-hopper-h100
    replicaCount: 0
    gpuCount: 4

# autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
autoscaling:
  horizontal:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetConcurrency: 50
  vertical:
    enabled: false
  prewarming:
    enabled: false
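  # Note (an assumption about the concurrency-based autoscaler, not stated in this file):
  # the replica count is driven toward total_concurrent_requests / targetConcurrency,
  # e.g. ~500 concurrent requests with targetConcurrency: 50 scales toward 10 pods,
  # clamped to [minReplicas, maxReplicas].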

# For async endpoints, the Celery autoscaler scales the number of pods based on the number of requests;
# num_shards is the number of instances of the autoscaler
celery_autoscaler:
  enabled: true
  num_shards: 3

podDisruptionBudget:
  enabled: true
  minAvailable: 1

# resources specify the k8s resources for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
resources:
  requests:
    cpu: 2
# nodeSelector specifies the node selector for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
nodeSelector: { }
# tolerations specifies the tolerations for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
tolerations: [ ]
# affinity specifies the affinity for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
affinity: { }

# aws specifies the AWS configurations (by configMap) for LLM Engine server deployments
aws:
  configMap:
    name: default-config
    create: true
  profileName: default

# serviceTemplate specifies additional flags for model endpoints
serviceTemplate:
  securityContext:
    capabilities:
      drop:
        - all
  mountInfraConfig: true
  # createServiceAccount/serviceAccountName/serviceAccountAnnotations specify whether to create a serviceAccount for
  # inference pods. Assumes the inference pods run in a separate namespace from the LLM Engine control plane.
  createServiceAccount: true
  serviceAccountName: model-engine
  serviceAccountAnnotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/llm-engine
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-2"

# config specifies the `data` field of the service config map
config:
  values:
    infra:
      # cloud_provider [required]; either "aws", "azure", or "gcp"
      cloud_provider: aws
      # k8s_cluster_name [required] is the name of the k8s cluster
      k8s_cluster_name: main_cluster
      # dns_host_domain [required] is the domain name of the k8s cluster
      dns_host_domain: llm-engine.domain.com
      # default_region [required] is the default AWS region for various resources (e.g. ECR)
      default_region: us-east-1
      # ml_account_id [required] is the AWS account ID for various resources (e.g. ECR) if cloud_provider is "aws", and the GCP project ID if cloud_provider is "gcp"
      ml_account_id: "000000000000"
      # docker_repo_prefix [required] is the prefix for AWS ECR repositories, GCP Artifact Registry repositories, or Azure Container Registry repositories
      docker_repo_prefix: "000000000000.dkr.ecr.us-east-1.amazonaws.com"
      # redis_host [required if redis_aws_secret_name not present] is the hostname of the redis cluster you wish to connect to
      redis_host: llm-engine-prod-cache.use1.cache.amazonaws.com
      # redis_aws_secret_name [optional] is the AWS secret that contains the connection info of the Redis cluster.
      # The information provided should be as follows:
      #   scheme: either redis:// or rediss://, will default to redis://
      #   auth_token (optional): an auth token for the Redis cluster
      #   host: the hostname of the Redis cluster
      #   port: the port of the Redis cluster
      #   query_params (optional): additional query parameters for the Redis cluster, will default to ""
      # The url will be built as follows:
      #   {scheme}{host}:{port}/{db_index}{query_params} if auth_token is not provided,
      #   {scheme}:{auth_token}@{host}:{port}/{db_index}{query_params} if auth_token is provided
      # db_index will be filled in by LLM Engine.
      # This secret must be accessible by the default LLM Engine AWS role,
      # e.g. what is set by profile_ml_worker if provided
      # redis_aws_secret_name: sample-prod/redis-credentials
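      # As an illustration (hypothetical values), a secret following the layout above
      # could store:
      #   {"scheme": "rediss://", "auth_token": "token123", "host": "my-redis.use1.cache.amazonaws.com", "port": "6379"}
      # which, per the rules above, would be assembled into
      #   rediss://:token123@my-redis.use1.cache.amazonaws.com:6379/{db_index}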
      # s3_bucket [required] is the S3 bucket you wish to connect to
      s3_bucket: "llm-engine"
      # DB engine configs (this is SQLAlchemy-heavy)
      db_engine_pool_size: 10
      db_engine_max_overflow: 10
      db_engine_echo: false
      db_engine_echo_pool: false
      db_engine_disconnect_strategy: "pessimistic"
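      # With standard SQLAlchemy pool semantics, pool_size + max_overflow bounds this at
      # roughly 10 + 10 = 20 connections per process; the "pessimistic" disconnect
      # strategy typically tests connections (e.g. a pre-ping) before they are used.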
      # prometheus_server_address [optional, required if you want to scale from zero for sync/streaming endpoints]
      # is the address of the Prometheus server to query for endpoint metrics
      prometheus_server_address: "http://prometheus-server.istio-system.svc.cluster.local:80"
    launch:
      # endpoint_namespace [required] is the K8s namespace the endpoints will be created in
      endpoint_namespace: llm-engine
      # cache_redis_aws_url is the full url for the redis cluster you wish to connect to,
      # cache_redis_azure_host is the redis cluster host when using cloud_provider azure, and
      # cache_redis_aws_secret_name is an AWS secret that contains the Redis credentials.
      # It has a field "cache-url" with the full URL of the Redis cluster (including db number).
      # Other fields are ignored; e.g. you can use the secret for multiple purposes.
      # This secret must be accessible by the default LLM Engine AWS role.
      # Exactly one of cache_redis_aws_url, cache_redis_azure_host, or cache_redis_aws_secret_name
      # must be provided; the unused alternatives are shown commented out below.
      cache_redis_aws_url: redis://llm-engine-prod-cache.use1.cache.amazonaws.com:6379/15
      # cache_redis_azure_host: llm-engine-cache.redis.cache.windows.net:6380
      # cache_redis_aws_secret_name: sample-prod/redis-credentials
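      # As an illustration (hypothetical value), such a secret could contain:
      #   {"cache-url": "redis://my-redis.use1.cache.amazonaws.com:6379/15"}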
      # s3_file_llm_fine_tuning_job_repository [required] is the S3 URI for the S3 bucket/key where you wish to save fine-tuned assets
      s3_file_llm_fine_tuning_job_repository: "s3://llm-engine/llm-ft-job-repository"
      # dd_trace_enabled specifies whether to enable Datadog tracing; Datadog must be installed in the cluster
      dd_trace_enabled: false
      istio_enabled: true
      sensitive_log_mode: false
      # Asynchronous endpoints configs (coming soon)
      sqs_profile: default
      # sqs_queue_policy_template [required] is the IAM policy template for the SQS queues of async endpoints.
      sqs_queue_policy_template: >
        {
          "Version": "2012-10-17",
          "Id": "__default_policy_ID",
          "Statement": [
            {
              "Sid": "__owner_statement",
              "Effect": "Allow",
              "Principal": {
                "AWS": "arn:aws:iam::000000000000:root"
              },
              "Action": "sqs:*",
              "Resource": "arn:aws:sqs:us-east-1:000000000000:${queue_name}"
            },
            {
              "Effect": "Allow",
              "Principal": {
                "AWS": "arn:aws:iam::000000000000:role/k8s-main-llm-engine"
              },
              "Action": "sqs:*",
              "Resource": "arn:aws:sqs:us-east-1:000000000000:${queue_name}"
            }
          ]
        }
      # sqs_queue_tag_template is the tag template for the SQS queues of async endpoints
      sqs_queue_tag_template: >
        {
          "Spellbook-Serve-Endpoint-Id": "${endpoint_id}",
          "Spellbook-Serve-Endpoint-Name": "${endpoint_name}",
          "Spellbook-Serve-Endpoint-Created-By": "${endpoint_created_by}"
        }
      billing_queue_arn: "unused"
      model_primitive_host: "unused"
      hf_user_fine_tuned_weights_prefix: "s3://llm-engine/fine_tuned_weights"
      tgi_repository: "text-generation-inference"
      vllm_repository: "vllm"
      lightllm_repository: "lightllm"
      tensorrt_llm_repository: "tensorrt-llm"
      batch_inference_vllm_repository: "llm-engine/batch-infer-vllm"
      user_inference_base_repository: "launch/inference"
      user_inference_pytorch_repository: "launch/inference/pytorch"
      user_inference_tensorflow_repository: "launch/inference/tf"
      docker_image_layer_cache_repository: "launch-docker-build-cache"

# Triton enhanced endpoints (coming soon)
triton:
  image:
    repository: 000000000000.dkr.ecr.us-west-2.amazonaws.com/std-ml-srv
    tag: e83eccbc8959f90ebbe4bda618b61ec6ee2d8394-triton

# imageCache specifies the image cache configuration for faster endpoint auto-scaling (coming soon)
imageCache:
  devices:
    - name: cpu
      nodeSelector:
        cpu-only: "true"
    - name: a10
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-ampere-a10
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: a100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-ampere-a100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: t4
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-tesla-t4
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-1g20gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-1g20gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-3g40gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-3g40gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"

# celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
celeryBrokerType: sqs

datadog:
  enabled: false
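
# recommendedHardware appears to map model size to endpoint hardware: a byGpuMemoryGb
# entry is presumably chosen as the smallest gpu_memory_le (GiB) bound that fits the
# model, e.g. a model needing ~40GB of GPU memory falls in the gpu_memory_le: 48 tier
# (2x A10); byModelName entries pin hardware for specific models.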
recommendedHardware:
  byGpuMemoryGb:
    - gpu_memory_le: 24
      cpus: 10
      gpus: 1
      memory: 24Gi
      storage: 80Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 48
      cpus: 20
      gpus: 2
      memory: 48Gi
      storage: 80Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 96
      cpus: 40
      gpus: 4
      memory: 96Gi
      storage: 96Gi
      gpu_type: nvidia-ampere-a10
      nodes_per_worker: 1
    - gpu_memory_le: 180
      cpus: 20
      gpus: 2
      memory: 160Gi
      storage: 160Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 320
      cpus: 40
      gpus: 4
      memory: 320Gi
      storage: 320Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 640
      cpus: 80
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - gpu_memory_le: 640
      cpus: 80
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 2
  byModelName:
    - name: llama-3-8b-instruct-262k
      cpus: 20
      gpus: 2
      memory: 40Gi
      storage: 40Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - name: deepseek-coder-v2
      cpus: 160
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1
    - name: deepseek-coder-v2-instruct
      cpus: 160
      gpus: 8
      memory: 800Gi
      storage: 640Gi
      gpu_type: nvidia-hopper-h100
      nodes_per_worker: 1