-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy path docker-compose.qwen-2b-gpu.ci.yml
More file actions
62 lines (56 loc) · 1.51 KB
/
docker-compose.qwen-2b-gpu.ci.yml
File metadata and controls
62 lines (56 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# CI compose file: runs Qwen2-VL-2B-Instruct (AWQ-quantized) under vLLM on a
# single NVIDIA GPU, registering itself against a Redis-based discovery service.
# NOTE: the top-level `version` key is obsolete and ignored by Compose v2;
# kept unchanged here.
version: "3.8"
services:
  # NOTE(review): service key `c` looks truncated (possibly a scrape/extraction
  # artifact) — confirm the intended service name against the repository.
  c:
    image: nillion/nilai-vllm:latest
    container_name: qwen2vl_2b_gpu
    deploy:
      resources:
        reservations:
          devices:
            # Reserve exactly one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    ulimits:
      memlock: -1  # unlimited locked memory (CUDA pinned host buffers)
      stack: 67108864  # 64 MiB stack
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      # `redis` is not defined in this file — presumably declared in a base
      # compose file merged with this one; verify in CI setup.
      redis:
        condition: service_healthy
    # vLLM server arguments, tuned for a memory-constrained CI GPU:
    # small context (1280 tokens), batch size 1, 75% GPU memory, eager mode
    # (no CUDA graphs), image-only multimodal input (video disabled).
    command:
      [
        "--model", "Qwen/Qwen2-VL-2B-Instruct-AWQ",
        "--model-impl", "vllm",
        "--tensor-parallel-size", "1",
        "--trust-remote-code",
        "--quantization", "awq",
        "--max-model-len", "1280",
        "--max-num-batched-tokens", "1280",
        "--max-num-seqs", "1",
        "--gpu-memory-utilization", "0.75",
        "--swap-space", "8",
        "--uvicorn-log-level", "warning",
        "--limit-mm-per-prompt", "{\"image\":1,\"video\":0}",
        "--skip-mm-profiling",
        "--enforce-eager"
      ]
    environment:
      # URL other services use to reach this container; resolvable on the
      # compose network via the container_name alias `qwen2vl_2b_gpu`.
      SVC_URL: http://qwen2vl_2b_gpu:8000
      DISCOVERY_URL: redis://redis:6379
      TOOL_SUPPORT: "true"
      MULTIMODAL_SUPPORT: "true"
      # Synchronous CUDA launches — slower, but gives precise CI error traces.
      CUDA_LAUNCH_BLOCKING: "1"
      VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1"
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
    volumes:
      # Persist downloaded model weights across container restarts.
      - hugging_face_models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s  # allow time for model load before counting failures
      timeout: 10s
volumes:
  hugging_face_models: