crawl4ai/deploy/docker/config.yml at 494005c819b77e2c6ce35b03cd1bdd8006a3e409 · unclecode/crawl4ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Application settings: API title and version, plus server options.
# NOTE(review): host/port/reload/workers/timeout_keep_alive look like
# web-server launch options — confirm against the code that reads them.
app:
  title: 'Crawl4AI API'
  # Quoted so the version stays a string.
  version: '1.0.0'
  host: '0.0.0.0'
  port: 11235
  reload: false
  workers: 1
  # NOTE(review): unit is presumably seconds — confirm.
  timeout_keep_alive: 300

# Default LLM settings.
llm:
  provider: 'openai/gpt-4o-mini'
  # Passing the API key directly in this file is possible but not recommended:
  # api_key: sk-...

# Redis Configuration
# To use an external Redis instead of the embedded one, set the REDIS_URL
# environment variable:
#   REDIS_URL=redis://:password@hostname:6379/0
# When using an external Redis, also set CRAWL4AI_DISABLE_EMBEDDED_REDIS=true
# to prevent the embedded Redis from starting.
redis:
  # uri: "redis://localhost:6379/0"  # Override with full URI (or use REDIS_URL env var)
  host: "localhost"
  port: 6379
  db: 0
  password: ""
  ssl: false
  # The ssl_* options below are intentionally unset. YAML's null literal is
  # `null`; a bare `None` is loaded as the string "None", which is truthy and
  # would be treated as a real value (e.g. a file path) by a consumer.
  # NOTE(review): confirm the consumer does not compare against the literal
  # string "None".
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null

# Rate limiting settings.
rate_limiting:
  enabled: true
  default_limit: '1000/minute'
  trusted_proxies: []
  # For production, use "redis://localhost:6379" instead.
  storage_uri: 'memory://'

# Security settings.
# WARNING: production deployments should enable security and use a proper
# SECRET_KEY:
#   - set jwt_enabled: true for authentication
#   - set the SECRET_KEY environment variable to a secure random value
#   - set CRAWL4AI_HOOKS_ENABLED=true only if hooks are needed (RCE risk)
security:
  enabled: false
  jwt_enabled: false
  https_redirect: false
  trusted_hosts:
    - '*'
  headers:
    x_content_type_options: 'nosniff'
    x_frame_options: 'DENY'
    # Double-quoted because the value itself contains single quotes.
    content_security_policy: "default-src 'self'"
    strict_transport_security: 'max-age=63072000; includeSubDomains'

# Crawler settings.
crawler:
  base_config:
    simulate_user: true
  memory_threshold_percent: 95.0
  rate_limiter:
    enabled: true
    base_delay:
      - 1.0
      - 2.0
  timeouts:
    # Timeout for stream initialization.
    stream_init: 30.0
    # Timeout for batch processing.
    batch_process: 300.0
  pool:
    # GLOBAL_SEM permits.
    max_pages: 40
    # Janitor idle cutoff. NOTE(review): 300 s is 5 minutes; the previous
    # comment claimed 30 min (which would be 1800) — confirm the intent.
    idle_ttl_sec: 300
  browser:
    kwargs:
      headless: true
      text_mode: true
    extra_args:
      # - "--single-process"
      - '--no-sandbox'
      - '--disable-dev-shm-usage'
      - '--disable-gpu'
      - '--disable-software-rasterizer'
      - '--disable-web-security'
      - '--allow-insecure-localhost'
      - '--ignore-certificate-errors'

# Logging settings.
logging:
  level: 'INFO'
  # Quoted: the value begins with `%`, which a plain scalar may not start with.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Observability settings: Prometheus and health-check endpoints.
observability:
  prometheus:
    enabled: true
    endpoint: '/metrics'
  health_check:
    endpoint: '/health'

# Webhook settings.
webhooks:
  enabled: true
  # Optional: default webhook URL for all jobs.
  default_url: null
  # Optional: default behavior for including data.
  data_in_payload: false
  retry:
    max_attempts: 5
    # Exponential backoff: 1s, 2s, 4s, 8s, 16s.
    initial_delay_ms: 1000
    max_delay_ms: 32000
    # 30s timeout per webhook call.
    timeout_ms: 30000
  # Optional: default headers to include.
  headers:
    User-Agent: 'Crawl4AI-Webhook/1.0'