-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy path docker-compose.qwen-2b-gpu.ci.yml
More file actions
62 lines (56 loc) · 1.51 KB
/
docker-compose.qwen-2b-gpu.ci.yml
File metadata and controls
62 lines (56 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# CI compose file: runs Qwen2-VL-2B-Instruct (AWQ-quantized) under vLLM on a
# single NVIDIA GPU, registering itself against a Redis-based discovery service.
# NOTE: the top-level `version` key is obsolete and ignored by Compose v2;
# kept unchanged here.
version: "3.8"
services:
  # NOTE(review): service key `c` looks truncated (possibly a scrape/extraction
  # artifact) — confirm the intended service name against the repository.
  c:
    image: nillion/nilai-vllm:latest
    container_name: qwen2vl_2b_gpu
    deploy:
      resources:
        reservations:
          devices:
            # Reserve exactly one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    ulimits:
      memlock: -1  # unlimited locked memory (CUDA pinned host buffers)
      stack: 67108864  # 64 MiB stack
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      # `redis` is not defined in this file — presumably declared in a base
      # compose file merged with this one; verify in CI setup.
      redis:
        condition: service_healthy
    # vLLM server arguments, tuned for a memory-constrained CI GPU:
    # small context (1280 tokens), batch size 1, 75% GPU memory, eager mode
    # (no CUDA graphs), image-only multimodal input (video disabled).
    command:
      [
        "--model", "Qwen/Qwen2-VL-2B-Instruct-AWQ",
        "--model-impl", "vllm",
        "--tensor-parallel-size", "1",
        "--trust-remote-code",
        "--quantization", "awq",
        "--max-model-len", "1280",
        "--max-num-batched-tokens", "1280",
        "--max-num-seqs", "1",
        "--gpu-memory-utilization", "0.75",
        "--swap-space", "8",
        "--uvicorn-log-level", "warning",
        "--limit-mm-per-prompt", "{\"image\":1,\"video\":0}",
        "--skip-mm-profiling",
        "--enforce-eager"
      ]
    environment:
      # URL other services use to reach this container; resolvable on the
      # compose network via the container_name alias `qwen2vl_2b_gpu`.
      SVC_URL: http://qwen2vl_2b_gpu:8000
      DISCOVERY_URL: redis://redis:6379
      TOOL_SUPPORT: "true"
      MULTIMODAL_SUPPORT: "true"
      # Synchronous CUDA launches — slower, but gives precise CI error traces.
      CUDA_LAUNCH_BLOCKING: "1"
      VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1"
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
    volumes:
      # Persist downloaded model weights across container restarts.
      - hugging_face_models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s  # allow time for model load before counting failures
      timeout: 10s
volumes:
  hugging_face_models: