# Source: slm-server/deploy/helm/values.yaml
# at 106cd32d948d3f44d043f630fd2dbd99091e09a2 (XyLearningProgramming/slm-server, GitHub)
# Default values for slm-server.

# Replica count for the workload.
# NOTE(review): persistence below uses a ReadWriteOnce hostPath volume and
# strategy.type is Recreate — confirm the storage layout before raising this
# above 1.
replicaCount: 1

# Container image to deploy.
image:
  repository: x3huang/slm-server
  # IfNotPresent: pull only when the image is not already present on the node.
  pullPolicy: IfNotPresent
  # Intentionally empty here; overridden by the CI/CD pipeline.
  tag: ""

# No image pull secrets by default.
imagePullSecrets: []
# Name overrides, empty by default.
# NOTE(review): presumably read by the chart's naming helpers (templates are
# not shown here) — confirm.
nameOverride: ""
fullnameOverride: ""

# ServiceAccount settings.
serviceAccount:
  # Enabled by default.
  # NOTE(review): presumably tells the chart to create a ServiceAccount for
  # the release — confirm against the templates.
  create: true
  # No extra annotations by default.
  annotations: {}
  # Empty by default.
  # NOTE(review): charts commonly derive a name when this is blank — confirm
  # against the templates.
  name: ""

# Service settings for the server.
service:
  # ClusterIP: the Service gets a cluster-internal IP only.
  type: ClusterIP
  port: 8000

# Prometheus ServiceMonitor settings.
serviceMonitor:
  enabled: true
  # Scrape target and cadence; the 30s timeout stays below the 71s interval.
  path: /metrics
  scrapeTimeout: 30s
  interval: 71s
  # Optional metadata for the ServiceMonitor object; empty by default.
  annotations: {}
  labels: {}
  # Optional relabeling rules; none by default.
  metricRelabelings: []
  relabelings: []

persistence:
  enabled: true
  # Volume shape.
  accessMode: ReadWriteOnce
  size: 5Gi # Adjust based on your model size
  # Mount path for the volume; the example SLM_MODEL_PATH in the env section
  # points to a file under this directory.
  mountPath: /app/models
  # Absolute path on the host node that holds the model data.
  hostPath: "/mnt/disks/ssd1/slm-data"
  # Name of the node that owns the storage.
  # Leave it unset here and supply it at deployment time.
  # nodeName: ""

# Ingress and HPA are both switched off for now.
# NOTE(review): `hpa.enabled` overlaps with `autoscaling.enabled` further
# down — confirm which key the templates actually read.
ingress:
  enabled: false

hpa:
  enabled: false

# Autoscaling settings (disabled by default). More information:
# https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
  enabled: false
  # Replica bounds.
  minReplicas: 1
  maxReplicas: 100
  # CPU utilization target, in percent.
  targetCPUUtilizationPercentage: 80
  # Optional memory target; commented out by default.
  # targetMemoryUtilizationPercentage: 80

# Environment variables to inject into the container.
# Empty by default. To set variables, replace `{}` with a block mapping and
# uncomment the entries you need; values are quoted so they stay strings.
# Example configuration for SLM server settings:
env: {}
  # Application settings
  # NOTE(review): the resources comment further down mentions n_ctx=8192 while
  # this example uses 4096 — confirm which value the sizing assumes.
  # SLM_MODEL_PATH: "/app/models/Qwen3-0.6B-Q4_K_M.gguf"
  # SLM_N_CTX: "4096"
  # SLM_N_THREADS: "2"
  # SLM_SEED: "42"
  # SLM_S_TIMEOUT: "1"

  # Logging settings
  # SLM_LOGGING__VERBOSE: "true"

  # Tracing settings
  # The username/password below are placeholders; supply real credentials at
  # deploy time rather than committing them to this file.
  # SLM_TRACING__ENABLED: "true"
  # SLM_TRACING__ENDPOINT: "https://tempo.example.com/api/traces"
  # SLM_TRACING__USERNAME: "your-username"
  # SLM_TRACING__PASSWORD: "your-password"

# Container resource requests and limits.
# Reference: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
# Sized for Qwen3-0.6B-Q4_K_M (484 MB) plus an n_ctx=8192 KV cache (~448 MB),
# roughly 932 MB in total, on 1-CPU / 1 GB VPS nodes.
resources:
  # Amount reserved for the container at scheduling time.
  requests:
    cpu: 200m
    memory: 600Mi
  # Upper bound; the memory limit sits just above the ~932 MB estimate.
  limits:
    cpu: 1
    memory: 1Gi

# Health-check probes for the container. Both probes target /health with the
# same cadence (every 70s, 30s timeout, 5 failures tolerated); only the
# initial delay differs.
probes:
  liveness:
    enabled: true
    path: /health
    # First liveness check runs 30s after start.
    initialDelaySeconds: 30
    periodSeconds: 70
    timeoutSeconds: 30
    failureThreshold: 5
    successThreshold: 1
  readiness:
    enabled: true
    path: /health
    # Readiness begins sooner: first check after 10s.
    initialDelaySeconds: 10
    periodSeconds: 70
    timeoutSeconds: 30
    failureThreshold: 5
    successThreshold: 1

# Update strategy. Recreate terminates the existing pods before new ones are
# created.
# NOTE(review): presumably chosen so old and new pods never share the
# ReadWriteOnce volume or the node's limited memory — confirm.
strategy:
  type: Recreate