# cems/docker-compose.yml (Chocksy/cems, main branch)

services:
  # PostgreSQL with pgvector for unified storage (vectors + metadata)
  postgres:
    image: pgvector/pgvector:pg16
    container_name: cems-postgres
    environment:
      POSTGRES_USER: cems
      # Falls back to a well-known default when POSTGRES_PASSWORD is unset.
      # Set a real value (environment or .env): port 5432 is published on the
      # host below, so the default password is reachable from outside.
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-cems_secure_password}
      POSTGRES_DB: cems
    volumes:
      - postgres_data:/var/lib/postgresql/data
      # The init script is only read at first start, so mount it read-only.
      - ./deploy/init.sql:/docker-entrypoint-initdb.d/init.sql:ro
    ports:
      - "5432:5432"
    healthcheck:
      # Probe over TCP instead of the default Unix socket. The postgres image
      # entrypoint runs init scripts against a temporary socket-only server, so
      # a socket probe can report "healthy" before the real server is up and
      # release services waiting on `condition: service_healthy` too early.
      test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -U cems -d cems"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped

  # Embedding server: llama.cpp serving Embedding Gemma 300M (768-dim).
  # NOTE(review): nothing in this file populates the llama_models volume, and
  # cems-server is configured with CEMS_EMBEDDING_BACKEND=openrouter — confirm
  # how the model file is provisioned and when this service is actually used.
  llama-embed:
    container_name: cems-llama-embed
    image: ghcr.io/ggml-org/llama.cpp:server
    restart: unless-stopped
    ports:
      - "8081:8081"
    volumes:
      # Model files are expected under /models (see --model below).
      - llama_models:/models
    # Server arguments, one token per list item (flag followed by its value).
    command:
      - "--model"
      - "/models/embeddinggemma-300M-Q8_0.gguf"
      - "--embeddings"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8081"
      - "--ctx-size"
      - "8192"
      - "--batch-size"
      - "4096"
      - "--ubatch-size"
      - "4096"
    healthcheck:
      # Exec-form probe of the server's /health endpoint on the container port.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8081/health"
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 5

  # Reranking server: llama.cpp serving Qwen3-Reranker-0.6B.
  # NOTE(review): cems-server in this file sets CEMS_RERANKER_BACKEND=disabled
  # and nothing here populates the llama_models volume — confirm when this
  # service is needed and how the model file is provisioned.
  llama-rerank:
    container_name: cems-llama-rerank
    image: ghcr.io/ggml-org/llama.cpp:server
    restart: unless-stopped
    ports:
      - "8082:8082"
    volumes:
      # Model files are expected under /models (see --model below).
      - llama_models:/models
    # Server arguments, one token per list item (flag followed by its value).
    command:
      - "--model"
      - "/models/Qwen3-Reranker-0.6B.Q8_0.gguf"
      - "--rerank"
      - "--host"
      - "0.0.0.0"
      - "--port"
      - "8082"
      - "--ctx-size"
      - "512"
    healthcheck:
      # Exec-form probe of the server's /health endpoint on the container port.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8082/health"
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 5

  # CEMS Python API Server
  cems-server:
    image: chocksy/cems-server:latest
    pull_policy: always
    container_name: cems-server
    # Environment values are written as strings so YAML never retypes them.
    environment:
      # Database (PostgreSQL with pgvector for vectors + metadata).
      # The password is interpolated verbatim into the URL, and the same
      # variable is passed raw to the postgres service, so avoid URL-reserved
      # characters (@ : / ? #) in POSTGRES_PASSWORD.
      CEMS_DATABASE_URL: postgresql://cems:${POSTGRES_PASSWORD:-cems_secure_password}@postgres:5432/cems
      # LLM (via OpenRouter). No default here: if the variable is unset,
      # Compose substitutes an empty string.
      OPENROUTER_API_KEY: ${OPENROUTER_API_KEY}
      # LLM models — override to control costs (OpenRouter format)
      # CEMS_LLM_MODEL: qwen/qwen3-32b              # Maintenance jobs (lint, consolidation)
      # CEMS_AGENTIC_MODEL: google/gemini-2.5-flash-lite  # Agentic search (needs 1M+ context)
      # Server mode
      CEMS_MODE: server
      CEMS_SERVER_HOST: "0.0.0.0"
      CEMS_SERVER_PORT: "8765"
      # Admin API key (for user management). No default here: if the variable
      # is unset, Compose substitutes an empty string.
      CEMS_ADMIN_KEY: ${CEMS_ADMIN_KEY}
      # Embeddings via OpenRouter (1536-dim, matches production)
      CEMS_EMBEDDING_BACKEND: openrouter
      CEMS_EMBEDDING_DIMENSION: "1536"
      # Reranker: llm (OpenRouter API), llamacpp_server, or disabled
      # BOTH RERANKERS HURT PERFORMANCE SIGNIFICANTLY:
      # - LLM reranker (2026-02-04): 88% → 81% (-7%)
      # - llamacpp_server reranker (2026-02-05): 86% → 28% (-58%)
      CEMS_RERANKER_BACKEND: disabled
    ports:
      - "8765:8765"
    depends_on:
      # Only postgres is a hard dependency; wait until its healthcheck passes.
      postgres:
        condition: service_healthy
      # llama-embed and llama-rerank are intentionally not listed here:
      # they run NATIVELY on Mac for 210x speedup.
      # Start them with: ./scripts/start-native-embedder.sh & ./scripts/start-native-reranker.sh
    healthcheck:
      # cems-mcp waits on this check via `condition: service_healthy`.
      test: ["CMD", "curl", "-f", "http://localhost:8765/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # Express MCP Wrapper (public-facing)
  cems-mcp:
    # Built locally from ./mcp-wrapper rather than pulled from a registry.
    build:
      context: ./mcp-wrapper
      dockerfile: Dockerfile
    container_name: cems-mcp
    environment:
      # Reaches the API over the Compose network by service name.
      PYTHON_API_URL: http://cems-server:8765
      # Quoted so the value stays a string.
      PORT: "8766"
    ports:
      - "8766:8766"
    depends_on:
      # Start only after the API server's healthcheck passes.
      cems-server:
        condition: service_healthy
    healthcheck:
      # Node one-liner: exit 0 only when GET /health answers with status 200.
      test:
        - "CMD"
        - "node"
        - "-e"
        - "require('http').get('http://localhost:8766/health', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

# Named volumes, declared with no options (explicit empty mappings).
volumes:
  # Mounted at /var/lib/postgresql/data by the postgres service.
  postgres_data: {}
  # Mounted at /models by the llama-embed and llama-rerank services.
  llama_models: {}