diff --git a/apps/api/native/Cargo.toml b/apps/api/native/Cargo.toml index f16908ac9..c4e96214d 100644 --- a/apps/api/native/Cargo.toml +++ b/apps/api/native/Cargo.toml @@ -29,10 +29,12 @@ zip = "5.0.0" calamine = "0.26" cfb = "0.10" tokio = "1.48.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", default-features = false, features = ["registry"] } [build-dependencies] napi-build = "2" [profile.release] lto = true -strip = "symbols" +strip = "debuginfo" diff --git a/apps/api/native/src/lib.rs b/apps/api/native/src/lib.rs index 731c4f2b4..f440be0cf 100644 --- a/apps/api/native/src/lib.rs +++ b/apps/api/native/src/lib.rs @@ -3,6 +3,7 @@ pub use crate::crawler::*; pub use crate::engpicker::*; pub use crate::html::*; +pub use crate::logging::*; pub use crate::pdf::*; pub use crate::utils::*; @@ -12,6 +13,7 @@ mod crawler; mod document; mod engpicker; mod html; +mod logging; mod pdf; mod utils; diff --git a/apps/api/native/src/logging.rs b/apps/api/native/src/logging.rs new file mode 100644 index 000000000..bf444e895 --- /dev/null +++ b/apps/api/native/src/logging.rs @@ -0,0 +1,269 @@ +use napi_derive::napi; +use serde::Serialize; +use serde_json::Value; +use std::sync::{Arc, Mutex}; +use tracing::field::{Field, Visit}; +use tracing::Level; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::Layer; + +/// Context passed from TypeScript to continue the trace. +#[derive(Clone)] +#[napi(object)] +pub struct NativeContext { + pub scrape_id: String, + pub url: String, +} + +/// A single log entry captured during Rust execution. 
+#[derive(Clone, Debug, Serialize)] +#[napi(object)] +pub struct NativeLogEntry { + pub level: String, + pub target: String, + pub message: String, + pub fields: Value, + pub timestamp_ms: f64, +} + +struct LogCollector { + logs: Arc>>, +} + +struct FieldVisitor { + fields: serde_json::Map, + message: Option, +} + +impl Visit for FieldVisitor { + fn record_debug(&mut self, field: &Field, value: &dyn std::fmt::Debug) { + if field.name() == "message" { + self.message = Some(format!("{:?}", value)); + } else { + self + .fields + .insert(field.name().to_string(), Value::String(format!("{:?}", value))); + } + } + + fn record_str(&mut self, field: &Field, value: &str) { + if field.name() == "message" { + self.message = Some(value.to_string()); + } else { + self + .fields + .insert(field.name().to_string(), Value::String(value.to_string())); + } + } + + fn record_i64(&mut self, field: &Field, value: i64) { + self + .fields + .insert(field.name().to_string(), Value::Number(value.into())); + } + + fn record_u64(&mut self, field: &Field, value: u64) { + self + .fields + .insert(field.name().to_string(), Value::Number(value.into())); + } + + fn record_f64(&mut self, field: &Field, value: f64) { + if let Some(n) = serde_json::Number::from_f64(value) { + self + .fields + .insert(field.name().to_string(), Value::Number(n)); + } + } + + fn record_bool(&mut self, field: &Field, value: bool) { + self + .fields + .insert(field.name().to_string(), Value::Bool(value)); + } +} + +impl Layer for LogCollector { + fn on_event(&self, event: &tracing::Event<'_>, _ctx: tracing_subscriber::layer::Context<'_, S>) { + let mut visitor = FieldVisitor { + fields: serde_json::Map::new(), + message: None, + }; + event.record(&mut visitor); + + let level = match *event.metadata().level() { + Level::ERROR => "error", + Level::WARN => "warn", + Level::INFO => "info", + Level::DEBUG => "debug", + Level::TRACE => "trace", + }; + + let entry = NativeLogEntry { + level: level.to_string(), + target: 
event.metadata().target().to_string(), + message: visitor.message.unwrap_or_default(), + fields: Value::Object(visitor.fields), + timestamp_ms: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs_f64() * 1000.0) + .unwrap_or(0.0), + }; + + if let Ok(mut logs) = self.logs.lock() { + logs.push(entry); + } + } +} + +#[derive(Debug)] +pub struct TracingResult { + pub value: T, + pub logs: Vec, +} + +/// Run a closure with tracing enabled, capturing all log events. +/// Wraps the closure in `catch_unwind` for panic safety. +/// +/// Returns `TracingResult<napi::Result<T>>` so that logs are **always** +/// available — even when the closure returns `Err` or panics. +pub fn with_native_tracing( + ctx: Option<&NativeContext>, + module: &str, + f: F, +) -> TracingResult> +where + F: FnOnce() -> napi::Result, +{ + let logs = Arc::new(Mutex::new(Vec::new())); + let collector = LogCollector { logs: logs.clone() }; + let subscriber = tracing_subscriber::Registry::default().with(collector); + + let result = tracing::subscriber::with_default(subscriber, || { + let _span = match ctx { + Some(c) => tracing::info_span!( + "native", + scrape_id = %c.scrape_id, + url = %c.url, + module = %module, + ) + .entered(), + None => tracing::info_span!("native", module = %module).entered(), + }; + + match std::panic::catch_unwind(std::panic::AssertUnwindSafe(f)) { + Ok(result) => result, + Err(panic_info) => { + let msg = if let Some(s) = panic_info.downcast_ref::<&str>() { + s.to_string() + } else if let Some(s) = panic_info.downcast_ref::() { + s.clone() + } else { + "unknown panic".to_string() + }; + let backtrace = std::backtrace::Backtrace::force_capture(); // NOTE(review): captured after catch_unwind has returned, so this shows the catch site rather than the panic origin; a panic hook would be needed for the latter + tracing::error!( + panic = true, + backtrace = %backtrace, + "native panic in {}: {}", module, msg, + ); + + Err(napi::Error::new( + napi::Status::GenericFailure, + format!("Rust panic in {module}: {msg}\nBacktrace:\n{backtrace}"), + )) + } + } + }); + + let collected = logs.lock().map(|l| 
l.clone()).unwrap_or_default(); + + TracingResult { + value: result, + logs: collected, + } +} + +/// Append serialized logs to a NAPI error so they survive the FFI boundary. +/// The TS side can extract them from `error.message` via `extractAndEmitNativeLogs`. +pub fn embed_logs_in_error(err: napi::Error, logs: &[NativeLogEntry]) -> napi::Error { + if logs.is_empty() { + return err; + } + if let Ok(logs_json) = serde_json::to_string(logs) { + napi::Error::new( + err.status, + format!("{}\n__native_logs__:{logs_json}", err.reason), + ) + } else { + err + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collects_logs() { + let traced = with_native_tracing(None, "test", || { + tracing::info!("hello from rust"); + Ok(42) + }); + + let value = traced.value.unwrap(); + assert_eq!(value, 42); + assert_eq!(traced.logs.len(), 1); + assert_eq!(traced.logs[0].level, "info"); + assert!(traced.logs[0].message.contains("hello from rust")); + } + + #[test] + fn test_with_context() { + let ctx = NativeContext { + scrape_id: "test-123".to_string(), + url: "https://example.com".to_string(), + }; + let traced = with_native_tracing(Some(&ctx), "pdf", || { + tracing::warn!("something odd"); + Ok("ok") + }); + + assert_eq!(traced.value.unwrap(), "ok"); + assert_eq!(traced.logs.len(), 1); + assert_eq!(traced.logs[0].level, "warn"); + } + + #[test] + fn test_captures_panic_with_logs() { + let traced: TracingResult> = with_native_tracing(None, "test", || { + panic!("test panic"); + }); + + assert!(traced.value.is_err()); + let err = traced.value.unwrap_err(); + assert!(err.reason.contains("test panic")); + assert!(err.reason.contains("Backtrace")); + // Panic log is preserved even though the closure failed + assert!(!traced.logs.is_empty()); + assert_eq!(traced.logs[0].level, "error"); + assert!(traced.logs[0].message.contains("test panic")); + } + + #[test] + fn test_error_preserves_logs() { + let traced: TracingResult> = with_native_tracing(None, "test", || { + 
tracing::info!("before error"); + Err(napi::Error::new( + napi::Status::GenericFailure, + "test error", + )) + }); + + assert!(traced.value.is_err()); + // Logs are preserved even on error paths + assert_eq!(traced.logs.len(), 1); + assert_eq!(traced.logs[0].level, "info"); + assert!(traced.logs[0].message.contains("before error")); + } +} diff --git a/apps/api/native/src/pdf.rs b/apps/api/native/src/pdf.rs index 412821b00..f97fc2743 100644 --- a/apps/api/native/src/pdf.rs +++ b/apps/api/native/src/pdf.rs @@ -1,9 +1,8 @@ use napi::bindgen_prelude::*; use napi_derive::napi; -use pdf_inspector::{ - PdfOptions, PdfType, - process_pdf_with_options as rust_process_pdf, -}; +use pdf_inspector::{PdfOptions, PdfType, process_pdf_with_options as rust_process_pdf}; + +use crate::logging::{embed_logs_in_error, with_native_tracing, NativeContext, NativeLogEntry}; #[napi(object)] pub struct PdfProcessResult { @@ -15,6 +14,7 @@ pub struct PdfProcessResult { pub title: Option, pub confidence: f64, pub is_complex: bool, + pub logs: Vec, } fn pdf_type_str(t: PdfType) -> &'static str { @@ -36,38 +36,83 @@ fn to_napi_result(result: pdf_inspector::PdfProcessResult) -> PdfProcessResult { title: result.title, confidence: result.confidence as f64, is_complex: result.layout.is_complex, + logs: Vec::new(), } } /// Process a PDF file: detect type, extract text + markdown if text-based. /// When `max_pages` is provided, only the first N pages are extracted. +/// Pass `ctx` (NativeContext) for structured tracing with scrape_id/url. 
#[napi] -pub fn process_pdf(path: String, max_pages: Option) -> Result { - let opts = match max_pages { - Some(n) if n > 0 => PdfOptions::new().pages(1..=n), - _ => PdfOptions::new(), - }; +pub fn process_pdf( + path: String, + max_pages: Option, + ctx: Option, +) -> Result { + let traced = with_native_tracing(ctx.as_ref(), "pdf", || { + tracing::info!(max_pages = ?max_pages, "starting PDF processing"); + + let opts = match max_pages { + Some(n) if n > 0 => PdfOptions::new().pages(1..=n), + _ => PdfOptions::new(), + }; + + let result = rust_process_pdf(&path, opts).map_err(|e| { + tracing::error!(error = %e, "PDF processing failed"); + Error::new(Status::GenericFailure, format!("Failed to process PDF: {e}")) + })?; - let result = rust_process_pdf(&path, opts).map_err(|e| { - Error::new( - Status::GenericFailure, - format!("Failed to process PDF: {e}"), - ) - })?; + tracing::info!( + pdf_type = pdf_type_str(result.pdf_type), + page_count = result.page_count, + confidence = %result.confidence, + is_complex = result.layout.is_complex, + "PDF processing complete" + ); - Ok(to_napi_result(result)) + Ok(to_napi_result(result)) + }); + + match traced.value { + Ok(mut result) => { + result.logs = traced.logs; + Ok(result) + } + Err(err) => Err(embed_logs_in_error(err, &traced.logs)), + } } /// Fast metadata-only detection: page count, title, type, confidence. /// Skips text extraction, markdown generation, and layout analysis. +/// Pass `ctx` (NativeContext) for structured tracing with scrape_id/url. 
#[napi] -pub fn detect_pdf(path: String) -> Result { - let result = rust_process_pdf(&path, PdfOptions::detect_only()).map_err(|e| { - Error::new( - Status::GenericFailure, - format!("Failed to detect PDF: {e}"), - ) - })?; +pub fn detect_pdf( + path: String, + ctx: Option, +) -> Result { + let traced = with_native_tracing(ctx.as_ref(), "pdf", || { + tracing::info!("starting PDF detection"); - Ok(to_napi_result(result)) + let result = rust_process_pdf(&path, PdfOptions::detect_only()).map_err(|e| { + tracing::error!(error = %e, "PDF detection failed"); + Error::new(Status::GenericFailure, format!("Failed to detect PDF: {e}")) + })?; + + tracing::info!( + pdf_type = pdf_type_str(result.pdf_type), + page_count = result.page_count, + confidence = %result.confidence, + "PDF detection complete" + ); + + Ok(to_napi_result(result)) + }); + + match traced.value { + Ok(mut result) => { + result.logs = traced.logs; + Ok(result) + } + Err(err) => Err(embed_logs_in_error(err, &traced.logs)), + } } diff --git a/apps/api/src/__tests__/snips/v2/scrape.test.ts b/apps/api/src/__tests__/snips/v2/scrape.test.ts index d3c2c9601..16e537629 100644 --- a/apps/api/src/__tests__/snips/v2/scrape.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape.test.ts @@ -27,8 +27,10 @@ import { } from "./lib"; import request from "./lib"; import crypto from "crypto"; +import { z } from "zod"; const CHANGE_TRACKING_TEST_URL = `${TEST_SUITE_WEBSITE}?testId=${crypto.randomUUID()}`; +const stringbool = z.stringbool().catch(false); let identity: Identity; @@ -52,6 +54,20 @@ beforeAll(async () => { describe("Scrape tests", () => { const base = TEST_SUITE_WEBSITE; + const playwrightAllowsLocalTargets = stringbool.parse( + process.env.ALLOW_LOCAL_WEBHOOKS, + ); + const createSelfHostedLocalUrl = () => { + const target = new URL(TEST_SUITE_WEBSITE); + target.searchParams.set("testId", crypto.randomUUID()); + return target.toString(); + }; + + const createDnsResolvedLocalUrl = () => { + const target = new 
URL(createSelfHostedLocalUrl()); + target.hostname = "localtest.me"; + return target.toString(); + }; concurrentIf(ALLOW_TEST_SUITE_WEBSITE)( "works", @@ -279,6 +295,50 @@ describe("Scrape tests", () => { scrapeTimeout, ); + concurrentIf( + TEST_SELF_HOST && + HAS_PLAYWRIGHT && + ALLOW_TEST_SUITE_WEBSITE && + playwrightAllowsLocalTargets, + )( + "playwright allows local-network targets when ALLOW_LOCAL_WEBHOOKS is enabled", + async () => { + const response = await scrape( + { + url: createSelfHostedLocalUrl(), + waitFor: 100, + }, + identity, + ); + + expect(response.markdown).toContain("Firecrawl"); + }, + scrapeTimeout, + ); + + concurrentIf( + TEST_SELF_HOST && HAS_PLAYWRIGHT && !playwrightAllowsLocalTargets, + )( + "playwright blocks local-network targets resolved via DNS", + async () => { + const raw = await scrapeRaw( + { + url: createDnsResolvedLocalUrl(), + waitFor: 100, + }, + identity, + ); + + expect(raw.statusCode).toBe(200); + expect(raw.body.success).toBe(true); + expect(raw.body.data?.metadata?.statusCode).toBe(403); + expect(raw.body.data?.metadata?.error).toContain( + "Blocked insecure target URL", + ); + }, + scrapeTimeout, + ); + concurrentIf(TEST_PRODUCTION || (HAS_PLAYWRIGHT && ALLOW_TEST_SUITE_WEBSITE))( "waitFor works", async () => { diff --git a/apps/api/src/controllers/v2/types.ts b/apps/api/src/controllers/v2/types.ts index a0e921eb5..30332c56d 100644 --- a/apps/api/src/controllers/v2/types.ts +++ b/apps/api/src/controllers/v2/types.ts @@ -405,7 +405,7 @@ const queryFormatWithOptions = z.strictObject({ prompt: z.string().max(10000), }); -export type QueryFormatWithOptions = z.output; +type QueryFormatWithOptions = z.output; export type FormatObject = | { type: "markdown" } diff --git a/apps/api/src/lib/native-logging.ts b/apps/api/src/lib/native-logging.ts new file mode 100644 index 000000000..a643ab313 --- /dev/null +++ b/apps/api/src/lib/native-logging.ts @@ -0,0 +1,77 @@ +import type { Logger } from "winston"; + +const 
NATIVE_LOGS_SEPARATOR = "\n__native_logs__:"; + +/** Matches the NativeLogEntry struct from Rust (@mendable/firecrawl-rs). */ +export interface NativeLogEntry { + level: string; + target: string; + message: string; + fields: Record; + timestampMs: number; +} + +/** + * Extract native logs embedded in a NAPI error message by `embed_logs_in_error`. + * Emits them through the logger and strips them from `error.message` in place (returns nothing). + */ +export function extractAndEmitNativeLogs( + error: unknown, + parentLogger: Logger, + module: string, +): void { + if (!(error instanceof Error)) return; + const idx = error.message.indexOf(NATIVE_LOGS_SEPARATOR); + if (idx === -1) return; + + const logsJson = error.message.slice(idx + NATIVE_LOGS_SEPARATOR.length); + + try { + const logs: NativeLogEntry[] = JSON.parse(logsJson); + // Only strip after successful parse so we don't lose data on failure + error.message = error.message.slice(0, idx); + emitNativeLogs(logs, parentLogger, module); + } catch { + // JSON parse failed — leave the original error message intact + } +} + +/** + * Emit log entries captured inside the Rust native module through a Winston + * logger, preserving trace context (scrape_id / url via the parent logger) + * and adding `source: "native"` + the Rust module name as labels. 
+ */ +export function emitNativeLogs( + logs: NativeLogEntry[] | undefined, + parentLogger: Logger, + module: string, +): void { + if (!logs || logs.length === 0) return; + + const childLogger = parentLogger.child({ source: "native", module }); + + for (const entry of logs) { + const meta = { + rustTarget: entry.target, + ...entry.fields, + }; + + switch (entry.level) { + case "error": + childLogger.error(entry.message, meta); + break; + case "warn": + childLogger.warn(entry.message, meta); + break; + case "info": + childLogger.info(entry.message, meta); + break; + case "debug": + case "trace": + childLogger.debug(entry.message, meta); + break; + default: + childLogger.info(entry.message, meta); + } + } +} diff --git a/apps/api/src/scraper/WebScraper/utils/ENGINE_FORCING.md b/apps/api/src/scraper/WebScraper/utils/ENGINE_FORCING.md index 5dfcbf155..978ca0791 100644 --- a/apps/api/src/scraper/WebScraper/utils/ENGINE_FORCING.md +++ b/apps/api/src/scraper/WebScraper/utils/ENGINE_FORCING.md @@ -27,10 +27,8 @@ The engine forcing is configured via the `FORCED_ENGINE_DOMAINS` environment var ### Available Engines - `fire-engine;chrome-cdp` - Advanced browser with Chrome DevTools Protocol -- `fire-engine;playwright` - Playwright-based browser automation - `fire-engine;tlsclient` - TLS fingerprinting for anti-bot bypass - `fire-engine;chrome-cdp;stealth` - Chrome CDP with stealth mode -- `fire-engine;playwright;stealth` - Playwright with stealth mode - `fire-engine;tlsclient;stealth` - TLS client with stealth mode - `playwright` - Direct Playwright integration - `fetch` - Simple HTTP requests diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index ca486e842..e062bd734 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -36,10 +36,6 @@ const successSchema = z.object({ // 
timeTakenCookie: z.number().optional(), // timeTakenRequest: z.number().optional(), - // legacy: playwright only - screenshot: z.string().optional(), - - // new: actions screenshots: z.string().array().optional(), actionContent: z .object({ diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 09d1a967d..e1652dcbf 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -6,7 +6,6 @@ import { fireEngineStagingURL, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, - FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient, } from "./scrape"; import { EngineScrapeResult } from ".."; @@ -48,7 +47,6 @@ const BRANDING_DEFAULT_WAIT_MS = 2000; async function performFireEngineScrape< Engine extends | FireEngineScrapeRequestChromeCDP - | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient, >( meta: Meta, @@ -244,7 +242,9 @@ async function performFireEngineScrape< "fire-engine.duration_ms": Date.now() - startTime, "fire-engine.status_code": status.pageStatusCode, "fire-engine.content_length": status.content?.length, - "fire-engine.has_screenshot": !!status.screenshot, + "fire-engine.has_screenshot": !!( + status.screenshots && status.screenshots.length > 0 + ), "fire-engine.has_pdf": !!(status as any).pdf, "fire-engine.job_id": (scrape as any).jobId, }); @@ -367,19 +367,12 @@ export async function scrapeURLWithFireEngineChromeCDP( true, ); + let screenshot: string | undefined; if (hasFormatOfType(meta.options.formats, "screenshot")) { - // meta.logger.debug( - // "Transforming screenshots from actions into screenshot field", - // { screenshots: response.screenshots }, - // ); - if (response.screenshots) { - response.screenshot = response.screenshots.slice(-1)[0]; + if (response.screenshots && response.screenshots.length > 0) { + screenshot = 
response.screenshots.slice(-1)[0]; response.screenshots = response.screenshots.slice(0, -1); } - // meta.logger.debug("Screenshot transformation done", { - // screenshots: response.screenshots, - // screenshot: response.screenshot, - // }); } if (!response.url) { @@ -435,7 +428,7 @@ export async function scrapeURLWithFireEngineChromeCDP( x => x[0].toLowerCase() === "content-type", ) ?? [])[1] ?? undefined, - screenshot: response.screenshot, + screenshot, ...(actions.length > 0 ? { actions: { @@ -456,84 +449,6 @@ export async function scrapeURLWithFireEngineChromeCDP( }); } -export async function scrapeURLWithFireEnginePlaywright( - meta: Meta, -): Promise { - return withSpan("engine.fire-engine.playwright", async span => { - setSpanAttributes(span, { - "engine.type": "fire-engine-playwright", - "engine.url": meta.url, - "engine.team_id": meta.internalOptions.teamId, - }); - const totalWait = meta.options.waitFor; - - const request: FireEngineScrapeRequestCommon & - FireEngineScrapeRequestPlaywright = { - url: meta.rewrittenUrl ?? meta.url, - scrapeId: meta.id, - engine: "playwright", - instantReturn: false, - - headers: meta.options.headers, - priority: meta.internalOptions.priority, - screenshot: - hasFormatOfType(meta.options.formats, "screenshot") !== undefined, - fullPageScreenshot: hasFormatOfType(meta.options.formats, "screenshot") - ?.fullPage, - wait: meta.options.waitFor, - geolocation: meta.options.location, - blockAds: meta.options.blockAds, - mobileProxy: meta.featureFlags.has("stealthProxy"), - - timeout: meta.abort.scrapeTimeout() ?? 
300000, - saveScrapeResultToGCS: - !meta.internalOptions.zeroDataRetention && - meta.internalOptions.saveScrapeResultToGCS, - zeroDataRetention: meta.internalOptions.zeroDataRetention, - }; - - let response = await performFireEngineScrape( - meta, - meta.logger.child({ - method: "scrapeURLWithFireEnginePlaywright/callFireEngine", - request, - }), - request, - meta.mock, - meta.abort.asSignal(), - ); - - if (!response.url) { - meta.logger.warn("Fire-engine did not return the response's URL", { - response, - sourceURL: meta.url, - }); - } - - return { - url: response.url ?? meta.url, - - html: response.content, - error: response.pageError, - statusCode: response.pageStatusCode, - - contentType: - (Object.entries(response.responseHeaders ?? {}).find( - x => x[0].toLowerCase() === "content-type", - ) ?? [])[1] ?? undefined, - - ...(response.screenshots !== undefined && response.screenshots.length > 0 - ? { - screenshot: response.screenshots[0], - } - : {}), - - proxyUsed: response.usedMobileProxy ? "stealth" : "basic", - timezone: response.timezone, - }; - }); -} - export async function scrapeURLWithFireEngineTLSClient( meta: Meta, ): Promise { @@ -603,7 +518,7 @@ export async function scrapeURLWithFireEngineTLSClient( export function fireEngineMaxReasonableTime( meta: Meta, - engine: "chrome-cdp" | "playwright" | "tlsclient", + engine: "chrome-cdp" | "tlsclient", ): number { const hasBranding = hasFormatOfType(meta.options.formats, "branding"); const defaultWait = hasBranding ? BRANDING_DEFAULT_WAIT_MS : 0; @@ -614,8 +529,6 @@ export function fireEngineMaxReasonableTime( if (engine === "tlsclient") { return 15000; - } else if (engine === "playwright") { - return (meta.options.waitFor ?? 
0) + 30000; } else { return ( effectiveWait + diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts index e100c7421..c7d80ce01 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts @@ -55,17 +55,6 @@ export type FireEngineScrapeRequestChromeCDP = { disableSmartWaitCache?: boolean; }; -export type FireEngineScrapeRequestPlaywright = { - engine: "playwright"; - blockAds?: boolean; // default: true - - // mutually exclusive, default: false - screenshot?: boolean; - fullPageScreenshot?: boolean; - - wait?: number; // default: 0 -}; - export type FireEngineScrapeRequestTLSClient = { engine: "tlsclient"; atsv?: boolean; // v0 only, default: false @@ -88,10 +77,6 @@ const successSchema = z.object({ // timeTakenCookie: z.number().optional(), // timeTakenRequest: z.number().optional(), - // legacy: playwright only - screenshot: z.string().optional(), - - // new: actions screenshots: z.string().array().optional(), actionContent: z .object({ @@ -176,7 +161,6 @@ export const fireEngineStagingURL = export async function fireEngineScrape< Engine extends | FireEngineScrapeRequestChromeCDP - | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient, >( meta: Meta, diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 97765f6b3..3524f403d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -5,7 +5,6 @@ import { documentMaxReasonableTime, scrapeDocument } from "./document"; import { fireEngineMaxReasonableTime, scrapeURLWithFireEngineChromeCDP, - scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient, } from "./fire-engine"; import { pdfMaxReasonableTime, scrapePDF } from "./pdf"; @@ -32,8 +31,6 @@ export type Engine = | "fire-engine(retry);chrome-cdp" | 
"fire-engine;chrome-cdp;stealth" | "fire-engine(retry);chrome-cdp;stealth" - | "fire-engine;playwright" - | "fire-engine;playwright;stealth" | "fire-engine;tlsclient" | "fire-engine;tlsclient;stealth" | "playwright" @@ -65,8 +62,6 @@ const engines: Engine[] = [ "fire-engine;chrome-cdp;stealth" as const, "fire-engine(retry);chrome-cdp" as const, "fire-engine(retry);chrome-cdp;stealth" as const, - // "fire-engine;playwright" as const, - // "fire-engine;playwright;stealth" as const, "fire-engine;tlsclient" as const, "fire-engine;tlsclient;stealth" as const, ] @@ -162,8 +157,6 @@ const engineHandlers: { "fire-engine(retry);chrome-cdp": scrapeURLWithFireEngineChromeCDP, "fire-engine;chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP, "fire-engine(retry);chrome-cdp;stealth": scrapeURLWithFireEngineChromeCDP, - "fire-engine;playwright": scrapeURLWithFireEnginePlaywright, - "fire-engine;playwright;stealth": scrapeURLWithFireEnginePlaywright, "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient, "fire-engine;tlsclient;stealth": scrapeURLWithFireEngineTLSClient, playwright: scrapeURLWithPlaywright, @@ -186,10 +179,6 @@ const engineMRTs: { fireEngineMaxReasonableTime(meta, "chrome-cdp"), "fire-engine(retry);chrome-cdp;stealth": meta => fireEngineMaxReasonableTime(meta, "chrome-cdp"), - "fire-engine;playwright": meta => - fireEngineMaxReasonableTime(meta, "playwright"), - "fire-engine;playwright;stealth": meta => - fireEngineMaxReasonableTime(meta, "playwright"), "fire-engine;tlsclient": meta => fireEngineMaxReasonableTime(meta, "tlsclient"), "fire-engine;tlsclient;stealth": meta => @@ -325,44 +314,6 @@ const engineOptions: { }, quality: -5, }, - "fire-engine;playwright": { - features: { - actions: false, - waitFor: true, - screenshot: true, - "screenshot@fullScreen": true, - pdf: false, - document: false, - atsv: false, - location: false, - mobile: false, - skipTlsVerification: false, - useFastMode: false, - stealthProxy: false, - branding: false, - disableAdblock: 
true, - }, - quality: 40, - }, - "fire-engine;playwright;stealth": { - features: { - actions: false, - waitFor: true, - screenshot: true, - "screenshot@fullScreen": true, - pdf: false, - document: false, - atsv: false, - location: false, - mobile: false, - skipTlsVerification: false, - useFastMode: false, - stealthProxy: true, - branding: false, - disableAdblock: true, - }, - quality: -10, - }, playwright: { features: { actions: false, @@ -543,9 +494,7 @@ export async function buildFallbackList(meta: Meta): Promise< "fire-engine(retry);chrome-cdp", "fire-engine;chrome-cdp;stealth", "fire-engine(retry);chrome-cdp;stealth", - "fire-engine;playwright", // "fire-engine;tlsclient", - // "fire-engine;playwright;stealth", // "fire-engine;tlsclient;stealth", ] as Engine[]) : []), diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index caf1d8170..9561f3029 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -23,6 +23,8 @@ import type { PDFMode } from "../../../../controllers/v2/types"; import { processPdf, detectPdf } from "@mendable/firecrawl-rs"; import { MAX_FILE_SIZE, MILLISECONDS_PER_PAGE } from "./types"; import type { PDFProcessorResult } from "./types"; +import { emitNativeLogs, extractAndEmitNativeLogs } from "../../../../lib/native-logging"; +import { withSpan, setSpanAttributes } from "../../../../lib/otel-tracer"; import { scrapePDFWithRunPodMU } from "./runpodMU"; import { scrapePDFWithParsePDF } from "./pdfParse"; import { captureExceptionWithZdrCheck } from "../../../../services/sentry"; @@ -155,8 +157,18 @@ export async function scrapePDF(meta: Meta): Promise { // When PDF_RUST_EXTRACT_ENABLE is off this is the only path taken, // matching current prod behaviour (detectPdf → MinerU → pdfParse). try { + const nativeCtx = { scrapeId: meta.id, url: meta.rewrittenUrl ?? 
meta.url }; const startedAt = Date.now(); - const detection = detectPdf(tempFilePath); + const detection = await withSpan("native.pdf.detect", async (span) => { + const result = detectPdf(tempFilePath, nativeCtx); + setSpanAttributes(span, { + "native.module": "pdf", + "native.pdf_type": result.pdfType, + "native.page_count": result.pageCount, + }); + emitNativeLogs(result.logs, meta.logger, "pdf.detect"); + return result; + }); const durationMs = Date.now() - startedAt; logger.info("detectPdf completed", { @@ -173,6 +185,7 @@ export async function scrapePDF(meta: Meta): Promise { : detection.pageCount; metadataTitle = detection.title ?? undefined; } catch (error) { + extractAndEmitNativeLogs(error, meta.logger, "pdf.detect"); logger.warn("detectPdf failed", { error, url: meta.rewrittenUrl ?? meta.url, @@ -189,8 +202,20 @@ export async function scrapePDF(meta: Meta): Promise { } else { // Rust extraction enabled (fast / auto modes). try { + const nativeCtx = { scrapeId: meta.id, url: meta.rewrittenUrl ?? meta.url }; const startedAt = Date.now(); - const pdfResult = processPdf(tempFilePath, maxPages ?? undefined); + const pdfResult = await withSpan("native.pdf.process", async (span) => { + const result = processPdf(tempFilePath, maxPages ?? undefined, nativeCtx); + setSpanAttributes(span, { + "native.module": "pdf", + "native.pdf_type": result.pdfType, + "native.page_count": result.pageCount, + "native.confidence": result.confidence, + "native.is_complex": result.isComplex, + }); + emitNativeLogs(result.logs, meta.logger, "pdf.process"); + return result; + }); const durationMs = Date.now() - startedAt; logger.info("processPdf completed", { @@ -258,6 +283,7 @@ export async function scrapePDF(meta: Meta): Promise { if (error instanceof PDFOCRRequiredError) { throw error; } + extractAndEmitNativeLogs(error, meta.logger, "pdf.process"); logger.warn("processPdf failed, falling back to MU/PdfParse", { error, url: meta.rewrittenUrl ?? 
meta.url, diff --git a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts index fca2526e2..143d34cf0 100644 --- a/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts +++ b/apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts @@ -32,14 +32,6 @@ export const urlSpecificParams: Record = { // }, // }, // }, - // "notion.com": { - // scrapeOptions: { waitFor: 2000 }, - // internalOptions: { forceEngine: "fire-engine;playwright" } - // }, - // "developer.apple.com": { - // scrapeOptions: { waitFor: 2000 }, - // internalOptions: { forceEngine: "fire-engine;playwright" } - // }, "digikey.com": { scrapeOptions: {}, internalOptions: { forceEngine: "fire-engine;tlsclient" }, diff --git a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts index f183df28e..19ddba04e 100644 --- a/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts +++ b/apps/api/src/scraper/scrapeURL/scrapeURL.test.ts @@ -22,7 +22,6 @@ import { parseMarkdown } from "../../lib/html-to-markdown"; const testEngines: (Engine | undefined)[] = [ undefined, "fire-engine;chrome-cdp", - "fire-engine;playwright", "fire-engine;tlsclient", "fetch", ]; @@ -30,7 +29,6 @@ const testEngines: (Engine | undefined)[] = [ const testEnginesScreenshot: (Engine | undefined)[] = [ undefined, "fire-engine;chrome-cdp", - "fire-engine;playwright", ]; describe("Standalone scrapeURL tests", () => { diff --git a/apps/api/src/services/ab-test.ts b/apps/api/src/services/ab-test.ts index ae5c302ca..4efee3390 100644 --- a/apps/api/src/services/ab-test.ts +++ b/apps/api/src/services/ab-test.ts @@ -5,7 +5,6 @@ import { config } from "../config"; import { FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, - FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient, } from "../scraper/scrapeURL/engines/fire-engine/scrape"; import { getDocFromGCS } from "../lib/gcs-jobs"; @@ -80,11 +79,7 @@ type ABTestDecision = 
export function abTestFireEngine( feRequest: FireEngineScrapeRequestCommon & - ( - | FireEngineScrapeRequestChromeCDP - | FireEngineScrapeRequestPlaywright - | FireEngineScrapeRequestTLSClient - ), + (FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestTLSClient), ): ABTestDecision { const abLogger = _logger.child({ method: "ABTestFireEngine" }); diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index 4e0534a6b..7d5e8f23e 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -3,6 +3,8 @@ import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest, import dotenv from 'dotenv'; import UserAgent from 'user-agents'; import { getError } from './helpers/get_error'; +import { lookup } from 'dns/promises'; +import IPAddr from 'ipaddr.js'; dotenv.config(); @@ -13,10 +15,106 @@ app.use(express.json()); const BLOCK_MEDIA = (process.env.BLOCK_MEDIA || 'False').toUpperCase() === 'TRUE'; const MAX_CONCURRENT_PAGES = Math.max(1, Number.parseInt(process.env.MAX_CONCURRENT_PAGES ?? 
'10', 10) || 10); +const ALLOW_LOCAL_WEBHOOKS = (process.env.ALLOW_LOCAL_WEBHOOKS || 'False').toUpperCase() === 'TRUE'; +const DNS_CACHE_TTL_MS = 30_000; const PROXY_SERVER = process.env.PROXY_SERVER || null; const PROXY_USERNAME = process.env.PROXY_USERNAME || null; const PROXY_PASSWORD = process.env.PROXY_PASSWORD || null; +const dnsLookupCache = new Map(); + +class InsecureConnectionError extends Error { + constructor(public readonly blockedUrl: string, reason: string) { + super(`Blocked insecure target URL "${blockedUrl}": ${reason}`); + this.name = 'InsecureConnectionError'; + } +} + +const normalizeHostname = (hostname: string): string => hostname.toLowerCase().replace(/\.$/, ''); + +const isHttpProtocol = (protocol: string): boolean => protocol === 'http:' || protocol === 'https:'; + +const isIPPrivate = (address: string): boolean => { + if (!IPAddr.isValid(address)) return false; + const parsedAddress = IPAddr.parse(address); + return parsedAddress.range() !== 'unicast'; +}; + +const isLocalHostname = (hostname: string): boolean => + hostname === 'localhost' || hostname.endsWith('.localhost'); + +const lookupWithCache = async (hostname: string): Promise => { + const cached = dnsLookupCache.get(hostname); + if (cached && cached.expiresAt > Date.now()) { + return cached.addresses; + } + + const resolvedAddresses = await lookup(hostname, { all: true, verbatim: true }); + const uniqueAddresses = [...new Set(resolvedAddresses.map(x => x.address))]; + dnsLookupCache.set(hostname, { + addresses: uniqueAddresses, + expiresAt: Date.now() + DNS_CACHE_TTL_MS, + }); + return uniqueAddresses; +}; + +const assertSafeTargetUrl = async (urlString: string): Promise => { + let parsedUrl: URL; + try { + parsedUrl = new URL(urlString); + } catch { + throw new InsecureConnectionError(urlString, 'URL is invalid'); + } + + if (!isHttpProtocol(parsedUrl.protocol)) { + throw new InsecureConnectionError(urlString, `unsupported protocol "${parsedUrl.protocol}"`); + } + + if 
(ALLOW_LOCAL_WEBHOOKS) { + return; + } + + const hostname = normalizeHostname(parsedUrl.hostname); + if (!hostname) { + throw new InsecureConnectionError(urlString, 'hostname is missing'); + } + + if (isLocalHostname(hostname)) { + throw new InsecureConnectionError(urlString, 'localhost targets are not allowed'); + } + + if (IPAddr.isValid(hostname)) { + if (isIPPrivate(hostname)) { + throw new InsecureConnectionError(urlString, `private IP "${hostname}" is not allowed`); + } + return; + } + + let resolvedAddresses: string[]; + try { + resolvedAddresses = await lookupWithCache(hostname); + } catch { + throw new InsecureConnectionError( + urlString, + `DNS lookup failed for "${hostname}", cannot verify target is safe`, + ); + } + + if (resolvedAddresses.length === 0) { + throw new InsecureConnectionError( + urlString, + `hostname "${hostname}" did not resolve to any IP address`, + ); + } + + if (resolvedAddresses.some(address => isIPPrivate(address))) { + throw new InsecureConnectionError(urlString, `hostname "${hostname}" resolves to a private IP`); + } +}; + +type ContextSecurityState = { + blockedNavigationRequestUrl: string | null; +}; class Semaphore { private permits: number; private queue: (() => void)[] = []; @@ -99,14 +197,18 @@ const initializeBrowser = async () => { }); }; -const createContext = async (skipTlsVerification: boolean = false) => { +const createContext = async (skipTlsVerification: boolean = false): Promise<{ context: BrowserContext; securityState: ContextSecurityState }> => { const userAgent = new UserAgent().toString(); const viewport = { width: 1280, height: 800 }; + const securityState: ContextSecurityState = { + blockedNavigationRequestUrl: null, + }; const contextOptions: any = { userAgent, viewport, ignoreHTTPSErrors: skipTlsVerification, + serviceWorkers: 'block', }; if (PROXY_SERVER && PROXY_USERNAME && PROXY_PASSWORD) { @@ -130,9 +232,23 @@ const createContext = async (skipTlsVerification: boolean = false) => { } // Intercept all 
requests to avoid loading ads - await newContext.route('**/*', (route: Route, request: PlaywrightRequest) => { - const requestUrl = new URL(request.url()); - const hostname = requestUrl.hostname; + await newContext.route('**/*', async (route: Route, request: PlaywrightRequest) => { + const requestUrlString = request.url(); + try { + await assertSafeTargetUrl(requestUrlString); + } catch (error) { + if (error instanceof InsecureConnectionError) { + if (request.isNavigationRequest()) { + securityState.blockedNavigationRequestUrl = requestUrlString; + } + console.warn(`Blocked request: ${requestUrlString}`); + return route.abort('blockedbyclient'); + } + throw error; + } + + const requestUrl = new URL(requestUrlString); + const hostname = normalizeHostname(requestUrl.hostname); if (AD_SERVING_DOMAINS.some(domain => hostname.includes(domain))) { console.log(hostname); @@ -141,7 +257,7 @@ const createContext = async (skipTlsVerification: boolean = false) => { return route.continue(); }); - return newContext; + return { context: newContext, securityState }; }; const shutdownBrowser = async () => { @@ -159,9 +275,28 @@ const isValidUrl = (urlString: string): boolean => { } }; -const scrapePage = async (page: Page, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => { +const scrapePage = async ( + page: Page, + url: string, + waitUntil: 'load' | 'networkidle', + waitAfterLoad: number, + timeout: number, + checkSelector: string | undefined, + securityState: ContextSecurityState, +) => { console.log(`Navigating to ${url} with waitUntil: ${waitUntil} and timeout: ${timeout}ms`); - const response = await page.goto(url, { waitUntil, timeout }); + let response; + try { + response = await page.goto(url, { waitUntil, timeout }); + } catch (error) { + if (securityState.blockedNavigationRequestUrl) { + throw new InsecureConnectionError( + securityState.blockedNavigationRequestUrl, + 'navigation to 
private/internal resource is not allowed', + ); + } + throw error; + } if (waitAfterLoad > 0) { await page.waitForTimeout(waitAfterLoad); @@ -199,7 +334,7 @@ app.get('/health', async (req: Request, res: Response) => { await initializeBrowser(); } - const testContext = await createContext(); + const { context: testContext } = await createContext(); const testPage = await testContext.newPage(); await testPage.close(); await testContext.close(); @@ -238,6 +373,19 @@ app.post('/scrape', async (req: Request, res: Response) => { return res.status(400).json({ error: 'Invalid URL' }); } + try { + await assertSafeTargetUrl(url); + } catch (error) { + if (error instanceof InsecureConnectionError) { + return res.json({ + content: '', + pageStatusCode: 403, + pageError: error.message, + }); + } + throw error; + } + if (!PROXY_SERVER) { console.warn('⚠️ WARNING: No proxy server provided. Your IP address may be blocked.'); } @@ -249,17 +397,28 @@ app.post('/scrape', async (req: Request, res: Response) => { await pageSemaphore.acquire(); let requestContext: BrowserContext | null = null; + let securityState: ContextSecurityState | null = null; let page: Page | null = null; try { - requestContext = await createContext(skip_tls_verification); + const contextBundle = await createContext(skip_tls_verification); + requestContext = contextBundle.context; + securityState = contextBundle.securityState; page = await requestContext.newPage(); if (headers) { await page.setExtraHTTPHeaders(headers); } - const result = await scrapePage(page, url, 'load', wait_after_load, timeout, check_selector); + const result = await scrapePage( + page, + url, + 'load', + wait_after_load, + timeout, + check_selector, + securityState, + ); const pageError = result.status !== 200 ? 
getError(result.status) : undefined; if (!pageError) { @@ -276,6 +435,13 @@ app.post('/scrape', async (req: Request, res: Response) => { }); } catch (error) { + if (error instanceof InsecureConnectionError) { + return res.json({ + content: '', + pageStatusCode: 403, + pageError: error.message, + }); + } console.error('Scrape error:', error); res.status(500).json({ error: 'An error occurred while fetching the page.' }); } finally { diff --git a/apps/playwright-service-ts/package.json b/apps/playwright-service-ts/package.json index ebf1df69d..ead74c048 100644 --- a/apps/playwright-service-ts/package.json +++ b/apps/playwright-service-ts/package.json @@ -14,6 +14,7 @@ "dependencies": { "dotenv": "^16.4.5", "express": "^5.2.1", + "ipaddr.js": "^2.3.0", "playwright": "^1.58.1", "user-agents": "^1.1.669" }, diff --git a/apps/playwright-service-ts/pnpm-lock.yaml b/apps/playwright-service-ts/pnpm-lock.yaml index fc834c420..fde4d0c91 100644 --- a/apps/playwright-service-ts/pnpm-lock.yaml +++ b/apps/playwright-service-ts/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: express: specifier: ^5.2.1 version: 5.2.1 + ipaddr.js: + specifier: ^2.3.0 + version: 2.3.0 playwright: specifier: ^1.58.1 version: 1.58.1 @@ -386,6 +389,10 @@ packages: resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==} engines: {node: '>= 0.10'} + ipaddr.js@2.3.0: + resolution: {integrity: sha512-Zv/pA+ciVFbCSBBjGfaKUya/CcGmUHzTydLMaTwrUUEM2DIEO3iZvueGxmacvmN50fGpGVKeTXpb2LcYQxeVdg==} + engines: {node: '>= 10'} + is-promise@4.0.0: resolution: {integrity: sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==} @@ -872,6 +879,8 @@ snapshots: ipaddr.js@1.9.1: {} + ipaddr.js@2.3.0: {} + is-promise@4.0.0: {} lodash.clonedeep@4.5.0: {} diff --git a/docker-compose.yaml b/docker-compose.yaml index 0dfe14f8f..d925f80f7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -67,6 +67,7 @@ services: 
PROXY_SERVER: ${PROXY_SERVER} PROXY_USERNAME: ${PROXY_USERNAME} PROXY_PASSWORD: ${PROXY_PASSWORD} + ALLOW_LOCAL_WEBHOOKS: ${ALLOW_LOCAL_WEBHOOKS} BLOCK_MEDIA: ${BLOCK_MEDIA} # Configure maximum concurrent pages for Playwright browser instances MAX_CONCURRENT_PAGES: ${CRAWL_CONCURRENT_REQUESTS:-10} diff --git a/examples/kubernetes/cluster-install/playwright-service.yaml b/examples/kubernetes/cluster-install/playwright-service.yaml index b40cfa835..5298a4584 100644 --- a/examples/kubernetes/cluster-install/playwright-service.yaml +++ b/examples/kubernetes/cluster-install/playwright-service.yaml @@ -4,6 +4,7 @@ metadata: name: playwright-service-config data: PORT: "3000" + ALLOW_LOCAL_WEBHOOKS: "false" --- apiVersion: apps/v1 kind: Deployment diff --git a/examples/kubernetes/firecrawl-helm/templates/playwright-configmap.yaml b/examples/kubernetes/firecrawl-helm/templates/playwright-configmap.yaml index f3be19085..bab3e047a 100644 --- a/examples/kubernetes/firecrawl-helm/templates/playwright-configmap.yaml +++ b/examples/kubernetes/firecrawl-helm/templates/playwright-configmap.yaml @@ -4,5 +4,6 @@ metadata: name: {{ include "firecrawl.fullname" . }}-playwright-config data: PORT: {{ default (printf "%v" .Values.service.playwright.port) .Values.playwrightConfig.PORT | quote }} + ALLOW_LOCAL_WEBHOOKS: {{ .Values.playwrightConfig.ALLOW_LOCAL_WEBHOOKS | quote }} BLOCK_MEDIA: {{ .Values.playwrightConfig.BLOCK_MEDIA | quote }} MAX_CONCURRENT_PAGES: {{ .Values.playwrightConfig.MAX_CONCURRENT_PAGES | quote }} diff --git a/examples/kubernetes/firecrawl-helm/values.yaml b/examples/kubernetes/firecrawl-helm/values.yaml index afa9868fb..b809ee188 100644 --- a/examples/kubernetes/firecrawl-helm/values.yaml +++ b/examples/kubernetes/firecrawl-helm/values.yaml @@ -160,6 +160,7 @@ config: playwrightConfig: PORT: "3000" + ALLOW_LOCAL_WEBHOOKS: "" BLOCK_MEDIA: "" MAX_CONCURRENT_PAGES: "10"