-
-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathbasic.yaml
More file actions
146 lines (146 loc) · 2.98 KB
/
basic.yaml
File metadata and controls
146 lines (146 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# OpenModel describing the Qwen2 0.5B Instruct GGUF model, sourced from a
# model hub. The Playground named qwen2-0--5b claims this model by name.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: qwen2-0--5b
spec:
  familyName: qwen2
  source:
    modelHub:
      modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
      # NOTE(review): presumably selects a single GGUF file from the
      # repository above — confirm against the OpenModel API reference.
      filename: qwen2-0_5b-instruct-q5_k_m.gguf
---
# Playground that runs one replica of the qwen2-0--5b model using the
# llamacpp backend runtime.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: qwen2-0--5b
spec:
  replicas: 1
  modelClaim:
    # Must match metadata.name of the OpenModel declared in this file.
    modelName: qwen2-0--5b
  backendRuntimeConfig:
    backendName: llamacpp
    configName: default
    args:
      - -fa  # use flash attention
---
# OpenModel describing the Qwen2.5 Coder 0.5B Instruct GGUF model, sourced
# from a model hub. The Playground named qwen2--5-coder claims this model
# by name.
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: qwen2--5-coder
spec:
  familyName: qwen2
  source:
    modelHub:
      modelID: Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
      # NOTE(review): presumably selects a single GGUF file from the
      # repository above — confirm against the OpenModel API reference.
      filename: qwen2.5-coder-0.5b-instruct-q2_k.gguf
---
# Playground that runs one replica of the qwen2--5-coder model using the
# llamacpp backend runtime.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: qwen2--5-coder
spec:
  replicas: 1
  modelClaim:
    # Must match metadata.name of the OpenModel declared in this file.
    modelName: qwen2--5-coder
  backendRuntimeConfig:
    backendName: llamacpp
    configName: default
    args:
      - -fa  # use flash attention
---
# GatewayClass handled by the Envoy Gateway controller; referenced by the
# Gateway of the same name via gatewayClassName.
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
  name: default-envoy-ai-gateway
spec:
  controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
# Gateway of class default-envoy-ai-gateway with a single HTTP listener on
# port 80. The AIGatewayRoute in this file attaches to it via targetRefs.
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: default-envoy-ai-gateway
spec:
  gatewayClassName: default-envoy-ai-gateway
  listeners:
    - name: http
      protocol: HTTP
      port: 80
---
# AIGatewayRoute attached to the default-envoy-ai-gateway Gateway. Each rule
# matches an exact value of the x-ai-eg-model request header and forwards to
# the AIServiceBackend with the same name as the model.
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIGatewayRoute
metadata:
  name: default-envoy-ai-gateway
spec:
  schema:
    name: OpenAI
  targetRefs:
    - name: default-envoy-ai-gateway
      kind: Gateway
      group: gateway.networking.k8s.io
  rules:
    # Rule for the qwen2-0--5b model.
    - matches:
        - headers:
            - type: Exact
              name: x-ai-eg-model
              value: qwen2-0--5b
      backendRefs:
        - name: qwen2-0--5b
      modelsOwnedBy: "llmaz"
      timeouts:
        request: 3m
    # Rule for the qwen2--5-coder model.
    - matches:
        - headers:
            - type: Exact
              name: x-ai-eg-model
              value: qwen2--5-coder
      backendRefs:
        - name: qwen2--5-coder
      modelsOwnedBy: "llmaz"
      timeouts:
        request: 3m
---
# AIServiceBackend for qwen2-0--5b: declares the OpenAI API schema and points
# at the Envoy Gateway Backend resource of the same name.
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend
metadata:
  name: qwen2-0--5b
spec:
  schema:
    name: OpenAI
  backendRef:
    name: qwen2-0--5b
    kind: Backend
    group: gateway.envoyproxy.io
---
# AIServiceBackend for qwen2--5-coder: declares the OpenAI API schema and
# points at the Envoy Gateway Backend resource of the same name.
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend
metadata:
  name: qwen2--5-coder
spec:
  schema:
    name: OpenAI
  backendRef:
    name: qwen2--5-coder
    kind: Backend
    group: gateway.envoyproxy.io
---
# Envoy Gateway Backend for qwen2-0--5b, addressed by cluster-internal FQDN
# on port 8080.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: Backend
metadata:
  name: qwen2-0--5b
  namespace: default
spec:
  endpoints:
    - fqdn:
        # NOTE(review): presumably the Service created for the qwen2-0--5b
        # Playground in the default namespace — verify the "-lb" name.
        hostname: qwen2-0--5b-lb.default.svc.cluster.local
        port: 8080
---
# Envoy Gateway Backend for qwen2--5-coder, addressed by cluster-internal
# FQDN on port 8080.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: Backend
metadata:
  name: qwen2--5-coder
  namespace: default
spec:
  endpoints:
    - fqdn:
        # NOTE(review): presumably the Service created for the qwen2--5-coder
        # Playground in the default namespace — verify the "-lb" name.
        hostname: qwen2--5-coder-lb.default.svc.cluster.local
        port: 8080