From 7e0aa61dc1d994dd41744a2e1cd900cba7c72703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Fri, 5 Jun 2026 21:58:51 -0400 Subject: [PATCH 1/3] fix(producer): restore 30s calibration timeout ceiling to prevent render hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v0.6.74 change (Math.min → Math.max in createCaptureCalibrationConfig) raised the calibration protocol timeout from 30s to the default 300s. When a CDP call stalls during session init — page.goto, pollHfReady, or any page.evaluate — the 300s timeout makes the render appear to hang indefinitely at "Initializing calibration session...". Restore Math.min so calibration stays capped at 30s: if Chrome is stuck, fail fast and let the fallback path recover. Also add phase-level timing logs to initializeSession so the next report pinpoints which step stalls. Closes #1231 --- packages/engine/src/services/frameCapture.ts | 22 ++++++++++++++++++- .../src/services/render/captureCost.ts | 11 +++++----- .../src/services/renderOrchestrator.test.ts | 13 ++++++----- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/packages/engine/src/services/frameCapture.ts b/packages/engine/src/services/frameCapture.ts index dda707cb9..203226c70 100644 --- a/packages/engine/src/services/frameCapture.ts +++ b/packages/engine/src/services/frameCapture.ts @@ -748,17 +748,26 @@ export async function initializeSession(session: CaptureSession): Promise const url = `${serverUrl}/index.html`; const pageNavigationTimeout = session.config?.pageNavigationTimeout ?? DEFAULT_CONFIG.pageNavigationTimeout; + const initStart = Date.now(); + const logInitPhase = (phase: string) => { + console.log(`[initSession:${session.captureMode}] ${phase} (${Date.now() - initStart}ms)`); + }; + if (session.captureMode === "screenshot") { // Screenshot mode: standard navigation, rAF works normally await page.goto(url, { waitUntil: "domcontentloaded", timeout: pageNavigationTimeout }); + logInitPhase("page.goto complete"); const pageReadyTimeout = session.config?.playerReadyTimeout ?? DEFAULT_CONFIG.playerReadyTimeout; await pollHfReady(page, pageReadyTimeout); + logInitPhase("pollHfReady complete"); await pollSubCompositionTimelines(page, pageReadyTimeout); + logInitPhase("pollSubCompositionTimelines complete"); await applyVideoMetadataHints(page, session.options.videoMetadataHints); + logInitPhase("applyVideoMetadataHints complete"); // Wait for all video elements to have decoded their CURRENT frame, not // just metadata. readyState >= 2 (HAVE_CURRENT_DATA) means a frame is @@ -811,9 +820,12 @@ export async function initializeSession(session: CaptureSession): Promise ); } await decodeAllImages(page); + logInitPhase("images ready + decoded"); await page.evaluate(`document.fonts?.ready`); + logInitPhase("fonts ready"); await waitForOptionalTailwindReady(page, pageReadyTimeout); + logInitPhase("tailwind ready"); // For PNG captures, force the page background fully transparent so the // captured screenshots carry a real alpha channel. Must run AFTER @@ -879,22 +891,27 @@ export async function initializeSession(session: CaptureSession): Promise ); })(); warmupLoopPromise.catch(() => {}); + logInitPhase("warmup loop started"); await page.goto(url, { waitUntil: "domcontentloaded", timeout: pageNavigationTimeout }); + logInitPhase("page.goto complete"); // Poll for window.__hf readiness using manual evaluate loop (waitForFunction // uses rAF polling internally, which won't fire in beginFrame mode). const pageReadyTimeout = session.config?.playerReadyTimeout ?? DEFAULT_CONFIG.playerReadyTimeout; try { await pollHfReady(page, pageReadyTimeout); + logInitPhase("pollHfReady complete"); } catch (err) { warmupState.running = false; throw err; } await pollSubCompositionTimelines(page, pageReadyTimeout); + logInitPhase("pollSubCompositionTimelines complete"); await applyVideoMetadataHints(page, session.options.videoMetadataHints); + logInitPhase("applyVideoMetadataHints complete"); // Same readyState contract as the screenshot path above (>= 2 / HAVE_CURRENT_DATA). const bfVideosReady = await pollVideosReady( @@ -916,6 +933,7 @@ export async function initializeSession(session: CaptureSession): Promise `Continuing render — affected videos will appear as blank/black frames.`, ); } + logInitPhase("pollVideosReady complete"); // Image readiness — parity with pollVideosReady. Defense against remote // URLs that bypass the htmlCompiler localize step. @@ -938,10 +956,12 @@ export async function initializeSession(session: CaptureSession): Promise ); } await decodeAllImages(page); + logInitPhase("images ready + decoded"); - // Font check (no rAF dependency — uses fonts.ready API directly) await page.evaluate(`document.fonts?.ready`); + logInitPhase("fonts ready"); await waitForOptionalTailwindReady(page, pageReadyTimeout); + logInitPhase("tailwind ready"); // Stop warmup. Unlocked mode exits on this flag; locked mode keeps ticking // until LOCKED_WARMUP_TICKS, so we await its promise to ensure the count is diff --git a/packages/producer/src/services/render/captureCost.ts b/packages/producer/src/services/render/captureCost.ts index 1a1e4fd13..a6f0c7f78 100644 --- a/packages/producer/src/services/render/captureCost.ts +++ b/packages/producer/src/services/render/captureCost.ts @@ -53,11 +53,10 @@ export const CAPTURE_CALIBRATION_TARGET_MS = 600; export const MAX_MEASURED_CAPTURE_COST_MULTIPLIER = 8; /** - * CDP protocol timeout used while running calibration. Bounded below - * the normal `cfg.protocolTimeout` so a wedged BeginFrame calibration - * times out fast and falls back to screenshot mode (see the - * `shouldFallbackToScreenshotAfterCalibrationError` path in the - * sequencer). + * CDP protocol timeout used while running calibration. This is a ceiling, + * not a floor — a wedged BeginFrame must time out fast so the sequencer + * can fall back to screenshot mode via + * `shouldFallbackToScreenshotAfterCalibrationError`. */ export const CAPTURE_CALIBRATION_PROTOCOL_TIMEOUT_MS = 30_000; @@ -161,7 +160,7 @@ export function resolveRenderWorkerCount( export function createCaptureCalibrationConfig(cfg: EngineConfig): EngineConfig { return { ...cfg, - protocolTimeout: Math.max(cfg.protocolTimeout, CAPTURE_CALIBRATION_PROTOCOL_TIMEOUT_MS), + protocolTimeout: Math.min(cfg.protocolTimeout, CAPTURE_CALIBRATION_PROTOCOL_TIMEOUT_MS), }; } diff --git a/packages/producer/src/services/renderOrchestrator.test.ts b/packages/producer/src/services/renderOrchestrator.test.ts index 528b5b9c2..3816cfbf0 100644 --- a/packages/producer/src/services/renderOrchestrator.test.ts +++ b/packages/producer/src/services/renderOrchestrator.test.ts @@ -737,21 +737,22 @@ describe("selectCaptureCalibrationFrames", () => { }); describe("capture calibration safeguards", () => { - it("respects user protocol timeout when higher than calibration default", () => { + it("caps protocol timeout at calibration ceiling for fast fallback", () => { const cfg = createConfig(); const calibrationCfg = createCaptureCalibrationConfig(cfg); - // User's 300s timeout is higher than the 30s calibration default — use the user's value - expect(calibrationCfg.protocolTimeout).toBe(300000); + // Default 300s is above the 30s calibration ceiling — cap at 30s + // so a wedged BeginFrame times out fast and falls back to screenshot + expect(calibrationCfg.protocolTimeout).toBe(30000); expect(cfg.protocolTimeout).toBe(300000); }); - it("uses calibration floor when user timeout is lower", () => { + it("preserves user timeout when already below calibration ceiling", () => { const cfg = createConfig(); cfg.protocolTimeout = 5000; - // 5s is below the 30s calibration floor — use the floor - expect(createCaptureCalibrationConfig(cfg).protocolTimeout).toBe(30000); + // 5s is below the 30s ceiling — keep the user's value + expect(createCaptureCalibrationConfig(cfg).protocolTimeout).toBe(5000); }); it("falls back to screenshot mode after beginFrame calibration failures", () => { From fb233e75ed711ef3d3801a16d6537fe465054680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Fri, 5 Jun 2026 22:03:54 -0400 Subject: [PATCH 2/3] fix(producer): add render pipeline observability for faster triage Log the resolved environment at pipeline start (platform, arch, node version, all timeout values, GPU mode), the calibration config showing the actual timeout being used vs the parent, Chrome version and capture mode at browser launch, and a structured failure summary on error with stage timings and console errors. These four log categories give agents and users enough context to file actionable issues without needing to reproduce the problem. --- .../engine/src/services/browserManager.ts | 5 ++++ .../src/services/render/captureCost.ts | 6 ++++ .../src/services/renderOrchestrator.ts | 29 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/packages/engine/src/services/browserManager.ts b/packages/engine/src/services/browserManager.ts index 9ddec1720..8cba057a9 100644 --- a/packages/engine/src/services/browserManager.ts +++ b/packages/engine/src/services/browserManager.ts @@ -369,6 +369,11 @@ async function launchBrowser( protocolTimeout, }); + const browserVersion = await browser.version().catch(() => "unknown"); + console.log( + `[BrowserManager] Browser launched (${browserVersion}, ${captureMode}, headlessShell=${!!headlessShell}, platform=${process.platform})`, + ); + if (captureMode === "beginframe") { const supported = await probeBeginFrameSupport(browser).catch(() => true); if (!supported) { diff --git a/packages/producer/src/services/render/captureCost.ts b/packages/producer/src/services/render/captureCost.ts index a6f0c7f78..16bd1d6b0 100644 --- a/packages/producer/src/services/render/captureCost.ts +++ b/packages/producer/src/services/render/captureCost.ts @@ -361,6 +361,12 @@ export async function runCaptureCalibration(input: { }; const calibrationCfg = createCaptureCalibrationConfig({ ...cfg, forceScreenshot }); + log.info("[Render] Calibration config", { + protocolTimeout: calibrationCfg.protocolTimeout, + parentProtocolTimeout: cfg.protocolTimeout, + forceScreenshot, + totalFrames, + }); let calibration: | { estimate: CaptureCostEstimate; samples: CaptureCalibrationSample[] } | undefined; diff --git a/packages/producer/src/services/renderOrchestrator.ts b/packages/producer/src/services/renderOrchestrator.ts index 75c5b3bde..0b653d5cc 100644 --- a/packages/producer/src/services/renderOrchestrator.ts +++ b/packages/producer/src/services/renderOrchestrator.ts @@ -1495,6 +1495,22 @@ export async function executeRenderJob( job.startedAt = new Date(); assertNotAborted(); + + log.info("[Render] Pipeline started", { + platform: process.platform, + arch: process.arch, + nodeVersion: process.version, + fps: job.config.fps, + format: outputFormat, + quality: job.config.quality, + browserGpuMode: cfg.browserGpuMode, + forceScreenshot: cfg.forceScreenshot, + protocolTimeout: cfg.protocolTimeout, + browserTimeout: cfg.browserTimeout, + pageNavigationTimeout: cfg.pageNavigationTimeout, + playerReadyTimeout: cfg.playerReadyTimeout, + }); + if (!existsSync(workDir)) mkdirSync(workDir, { recursive: true }); if (job.config.debug) { @@ -2126,6 +2142,19 @@ export async function executeRenderJob( hdrDiagnostics, }); + log.info("[Render] Failure summary", { + failedStage: job.currentStage, + error: errorMessage, + elapsedMs: Date.now() - pipelineStart, + stageTimings: perfStages, + isTimeout: isTimeoutError, + workers: job.config.workers ?? "auto", + protocolTimeout: cfg.protocolTimeout, + browserConsoleErrors: lastBrowserConsole + .filter((l) => l.includes("ERROR") || l.includes("PAGEERROR")) + .slice(-5), + }); + await cleanupRenderResources({ fileServer, probeSession, From d74e7b6a78ec0b910dd7d4d2ea5e669f9658d723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81ngel?= Date: Fri, 5 Jun 2026 22:16:09 -0400 Subject: [PATCH 3/3] fix(engine): add missing pollVideosReady phase log in screenshot path The BeginFrame path logged this phase but the screenshot path didn't, creating an instrumentation gap when diagnosing hangs on macOS where screenshot mode is always used. --- packages/engine/src/services/frameCapture.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/engine/src/services/frameCapture.ts b/packages/engine/src/services/frameCapture.ts index 203226c70..d02ed650c 100644 --- a/packages/engine/src/services/frameCapture.ts +++ b/packages/engine/src/services/frameCapture.ts @@ -786,6 +786,7 @@ export async function initializeSession(session: CaptureSession): Promise session.options.skipReadinessVideoIds ?? [], pageReadyTimeout, ); + logInitPhase("pollVideosReady complete"); if (!videosReady) { const failedVideos = await page.evaluate((skipIdList: readonly string[]) => { const skip = new Set(skipIdList);