docker-compose.llama-3b-gpu.yml
services:
  llama_3b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia        # reserve one NVIDIA GPU for this service
              count: 1
              capabilities: [gpu]
    ulimits:
      memlock: -1                   # unlimited locked memory (CUDA pinned buffers)
      stack: 67108864               # 64 MiB stack size
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      redis:                        # redis is defined elsewhere, e.g. in a base compose file merged with this one
        condition: service_healthy
    command: >                      # arguments passed through to the vLLM server
      --model meta-llama/Llama-3.2-3B-Instruct
      --gpu-memory-utilization 0.5
      --max-model-len 30000
      --max-num-batched-tokens 30000
      --tensor-parallel-size 1
      --enable-auto-tool-choice
      --tool-call-parser llama3_json
      --uvicorn-log-level warning
    environment:
      - SVC_URL=http://llama_3b_gpu:8000
      - DISCOVERY_URL=redis://redis:6379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s

volumes:
  hugging_face_models:              # named volume so downloaded model weights persist across restarts
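
The depends_on block requires a redis service with a healthcheck, which is not defined in this file; it has to come from another compose file merged with this one, or compose will refuse to start. A minimal sketch of what that companion service could look like, assuming the stock Redis image; the image tag and healthcheck details below are illustrative, not taken from this repository:

services:
  redis:
    image: redis:7                         # assumed tag; the real base file may pin a different one
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]   # lets condition: service_healthy resolve
      interval: 10s
      timeout: 5s
      retries: 5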
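
Note also that no ports mapping is declared, so the vLLM server on port 8000 is only reachable from other containers on the compose network (matching the internal SVC_URL). To reach it from the host for a quick test, one could layer an override file on top; a sketch, with the file name docker-compose.override.yml being an assumption:

services:
  llama_3b_gpu:
    ports:
      - "8000:8000"   # publish the OpenAI-compatible API and /health endpoint to the host

With the override applied, the same http://localhost:8000/health endpoint that the container healthcheck polls becomes reachable from the host once the model has finished loading.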