// forgecrawl/ecosystem.config.cjs — ICJIA/forgecrawl (main branch)

/**
 * ecosystem.config.cjs — PM2 process manager configuration for ForgeCrawl.
 *
 * This file is used for bare-metal deployments (non-Docker). It tells PM2
 * how to start, monitor, and restart the ForgeCrawl application.
 *
 * USAGE:
 *   pm2 start ecosystem.config.cjs       # Start the app
 *   pm2 restart forgecrawl               # Restart after code changes
 *   pm2 stop forgecrawl                  # Stop the app
 *   pm2 logs forgecrawl                  # View logs
 *   pm2 monit                            # Real-time monitoring dashboard
 *
 * FIRST-TIME SETUP:
 *   1. Build the app:  cd packages/app && pnpm build
 *   2. Copy secrets:   cp .env.example .env  (fill in NUXT_AUTH_SECRET)
 *   3. Create log dir: sudo mkdir -p /var/log/forgecrawl && sudo chown $USER /var/log/forgecrawl
 *   4. Start:          pm2 start ecosystem.config.cjs
 *   5. Auto-start:     pm2 save && pm2 startup
 *
 * IMPORTANT NOTES:
 *   - Public configuration defaults (ports, timeouts, concurrency, etc.) are
 *     defined in forgecrawl.config.ts and baked into the build at compile time.
 *     You do NOT need to duplicate them here as env vars.
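 *   - Secrets (NUXT_AUTH_SECRET, NUXT_ENCRYPTION_KEY) live in .env and must
 *     never be hardcoded in this file. Note that Nuxt's documentation says a
 *     built server (.output/) does not read .env by itself, so confirm the
 *     variables actually reach the process (for example via Node's
 *     --env-file flag or the shell environment PM2 is started from).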
 *   - This file will evolve as new phases are completed. See the phase docs
 *     in docs/ for upcoming changes.
 *
 * WHY FORK MODE (NOT CLUSTER):
 *   Puppeteer shares a single browser instance across requests. Cluster mode
 *   would spawn multiple Node processes, each launching its own Chromium
 *   instance, quickly exhausting RAM. ForgeCrawl scales via Puppeteer's
 *   internal page concurrency (configured in forgecrawl.config.ts), not
 *   via multiple Node processes.
 */

module.exports = {
  apps: [
    {
      // ── App Identity ──────────────────────────────────────────────────
      /**
       * The name PM2 uses to identify this process.
       * Used in: pm2 restart forgecrawl, pm2 logs forgecrawl, etc.
       */
      name: 'forgecrawl',

      /**
       * Path to the built Nuxt server entry point.
       * This file is generated by `pnpm build` (or `nuxt build`) inside
       * packages/app/. The .output/ directory contains the compiled,
       * production-ready server.
       */
      script: 'packages/app/.output/server/index.mjs',

      // ── Process Mode ──────────────────────────────────────────────────
      /**
       * Number of process instances. Must be 1 for ForgeCrawl.
       * See "WHY FORK MODE" in the header comment above.
       */
      instances: 1,

      /**
       * Execution mode. 'fork' runs a single process (required).
       * Do NOT change to 'cluster' — Puppeteer will OOM.
       */
      exec_mode: 'fork',

      // ── Environment ───────────────────────────────────────────────────
      /**
       * Environment variables passed to the Node process.
       *
       * Only deployment-specific overrides go here. Public defaults are
       * baked into the build from forgecrawl.config.ts. Secrets are
       * loaded from .env by Nuxt automatically.
       *
       * You can override any NUXT_* variable here if needed for this
       * specific deployment, but prefer editing forgecrawl.config.ts
       * (and rebuilding) for non-secret values.
       */
      env: {
        NODE_ENV: 'production',

        /**
         * HTTP port. Must match your reverse proxy (Nginx) upstream.
         * Default is 5150 (set in forgecrawl.config.ts). Override here
         * only if you need a different port for this deployment.
         */
        PORT: 5150,

        /**
         * DO NOT put secrets here. They go in .env:
         *   NUXT_AUTH_SECRET=your-secret-here
         *   NUXT_ENCRYPTION_KEY=your-key-here
         *
         * Nuxt reads .env automatically when NODE_ENV=production and
         * the .env file is in the working directory (project root).
         */
      },

      // ── Memory Management ─────────────────────────────────────────────
      /**
       * Auto-restart if the process exceeds this memory limit.
       * Puppeteer/Chromium can leak memory over time, especially with
       * many concurrent scrapes. This is a safety net.
       *
       * Recommended values based on your VPS:
       *   2 GB RAM → '1G'   (leave room for OS + Chromium)
       *   4 GB RAM → '2500M' (comfortable headroom)
       *   8 GB RAM → '4G'   (generous for heavy crawling)
       *
       * If PM2 restarts the process frequently due to this limit,
       * either reduce puppeteer.concurrency in forgecrawl.config.ts
       * or upgrade your server.
       */
      max_memory_restart: '2500M',

      // ── Logging ───────────────────────────────────────────────────────
      /**
       * Timestamp format prepended to each log line.
       * Uses moment.js format tokens (PM2 built-in).
       */
      log_date_format: 'YYYY-MM-DD HH:mm:ss',

      /**
       * Log file paths. Create the directory first:
       *   sudo mkdir -p /var/log/forgecrawl
       *   sudo chown $USER /var/log/forgecrawl
       *
       * Alternatively, use PM2's default log location (~/.pm2/logs/)
       * by removing these two lines.
       */
      error_file: '/var/log/forgecrawl/error.log',
      out_file: '/var/log/forgecrawl/output.log',

      /**
       * Merge stdout and stderr into a single log file.
       * Set to true if you prefer unified logs. Default: false (separate files).
       */
      merge_logs: false,

      /**
       * Maximum log file size before rotation. PM2 will rotate logs
       * when they exceed this size (requires pm2-logrotate module).
       * Install: pm2 install pm2-logrotate
       */
      // max_size: '50M',

      // ── Restart Behavior ──────────────────────────────────────────────
      /**
       * Auto-restart on crash. Should always be true in production.
       */
      autorestart: true,

      /**
       * Delay between automatic restarts in milliseconds.
       * Prevents tight restart loops if the app crashes immediately on start.
       */
      restart_delay: 5000,

      /**
       * Maximum number of rapid restarts before PM2 stops trying.
       * If the app crashes more than this many times within min_uptime,
       * PM2 marks it as "errored" and stops restarting.
       * Set to 0 for unlimited restarts.
       */
      max_restarts: 15,

      /**
       * Minimum uptime (ms) to consider a start "successful".
       * If the process exits before this threshold, it counts as
       * an unstable restart toward max_restarts.
       */
      min_uptime: 10000,

      // ── Signals ───────────────────────────────────────────────────────
      /**
       * Graceful shutdown timeout in milliseconds.
       * When PM2 sends SIGINT, the app has this long to finish in-flight
       * requests and close the Puppeteer browser before PM2 sends SIGKILL.
       * 10 seconds is enough for most scrapes to complete.
       */
      kill_timeout: 10000,

      /**
       * Wait for the app to handle process.on('SIGINT') before killing.
       * Required for graceful Puppeteer browser cleanup.
       */
      wait_ready: false,

      // ── Watch (Development Only) ──────────────────────────────────────
      /**
       * File watching for auto-restart on changes. DISABLED in production.
       * For development, use `pnpm dev:app` instead of PM2.
       */
      watch: false,

      // ── Future Phases ─────────────────────────────────────────────────
      /**
       * TODO (Phase 3): When the job queue worker is split into a separate
       * process, add a second app entry here:
       *
       *   {
       *     name: 'forgecrawl-worker',
       *     script: 'packages/app/.output/server/worker.mjs',
       *     instances: 1,
       *     exec_mode: 'fork',
       *     env: { NODE_ENV: 'production' },
       *     max_memory_restart: '1G',
       *   }
       *
       * For now, the queue worker runs in-process with the main server.
       */
    },
  ],
}