-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalues.yaml
More file actions
111 lines (95 loc) · 2.51 KB
/
values.yaml
File metadata and controls
111 lines (95 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Default values for slm-server.

# NOTE(review): presumably the number of pod replicas — confirm against the templates.
replicaCount: 1

# Container image to deploy.
image:
  repository: x3huang/slm-server
  pullPolicy: IfNotPresent
  # Overridden by the CI/CD pipeline
  tag: ""
# Image pull secrets and name overrides; all are empty by default.
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
# Service account settings.
serviceAccount:
  create: true
  annotations: {}
  # NOTE(review): presumably an empty name lets the chart derive one — confirm against the templates.
  name: ""
# Service type and port.
service:
  type: ClusterIP
  port: 8000
# ServiceMonitor configuration for Prometheus
serviceMonitor:
  enabled: true
  path: /metrics
  # The scrape timeout (30s) is shorter than the scrape interval (71s).
  # NOTE(review): 71s is an unusual interval; confirm it is deliberate.
  interval: 71s
  scrapeTimeout: 30s
  labels: {}
  annotations: {}
  relabelings: []
  metricRelabelings: []
# Storage for model data, mounted into the container at mountPath.
persistence:
  enabled: true
  # The absolute path on the host node where the model data is stored.
  hostPath: "/mnt/disks/ssd1/slm-data"
  # The name of the node where the storage is located.
  # This should be left empty and set during deployment.
  # nodeName: ""
  accessMode: ReadWriteOnce
  size: 5Gi  # Adjust based on your model size
  mountPath: /app/models
# We are not using ingress or hpa for now.
ingress:
  enabled: false

# NOTE(review): this file also defines autoscaling.enabled; confirm which of
# the two keys the templates actually read and consider dropping the other.
hpa:
  enabled: false
# Autoscaling settings (disabled by default).
# More information: https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80
# Environment variables to inject into the container.
# Empty by default. The commented entries below are an example configuration
# for the SLM server; to use them, replace `{}` with a mapping of the entries
# you need. Values are quoted so they stay strings.
env: {}
  # Application settings
  # SLM_MODEL_PATH: "/app/models/Qwen3-0.6B-Q4_K_M.gguf"
  # SLM_N_CTX: "4096"
  # SLM_N_THREADS: "2"
  # SLM_SEED: "42"
  # SLM_S_TIMEOUT: "1"
  # Logging settings
  # SLM_LOGGING__VERBOSE: "true"
  # Tracing settings
  # SLM_TRACING__ENABLED: "true"
  # SLM_TRACING__ENDPOINT: "https://tempo.example.com/api/traces"
  # NOTE(review): avoid committing real credentials here; prefer supplying
  # them at deploy time from a secret store.
  # SLM_TRACING__USERNAME: "your-username"
  # SLM_TRACING__PASSWORD: "your-password"
# Resource requests and limits for the container.
# See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes.
# NOTE(review): the commented SLM_N_CTX example under `env` uses "4096", not
# 8192 — confirm which context size this sizing is meant for.
# NOTE(review): 484 MB + ~448 MB is about 932 MB, which leaves little headroom
# under the 1Gi limit; verify against observed memory usage.
resources:
  limits:
    cpu: 1
    memory: 1Gi
  requests:
    cpu: 200m
    memory: 600Mi
# Readiness and liveness probes configuration.
# Both probes use the /health path with the same period, timeout and
# thresholds; only initialDelaySeconds differs (10 for readiness, 30 for
# liveness).
# NOTE(review): assuming these map directly onto Kubernetes probe fields, five
# consecutive failures at a 70s period means roughly 350s before a failing
# container is acted on — confirm this slow cadence is intentional.
probes:
  readiness:
    enabled: true
    path: /health
    initialDelaySeconds: 10
    periodSeconds: 70
    timeoutSeconds: 30
    successThreshold: 1
    failureThreshold: 5
  liveness:
    enabled: true
    path: /health
    initialDelaySeconds: 30
    periodSeconds: 70
    timeoutSeconds: 30
    successThreshold: 1
    failureThreshold: 5
# Deployment update strategy.
# NOTE(review): presumably Recreate is used because the ReadWriteOnce hostPath
# volume cannot be shared between old and new pods — confirm.
strategy:
  type: Recreate