Skip to content

Commit b41e2b7

Browse files
committed
Refactor fetch_html to use shared session browser code
Replaces the duplicate browser launch/wait/extract logic in fetch_html with calls to session::launch_browser and session::extract_page_content. Deletes fetch_with_browser entirely. fetch_html is now a thin wrapper: launch → extract → tear down, using the same code path as persistent browser sessions. https://claude.ai/code/session_01QvEcr1eaxtqwA2Ry5DGwhp
1 parent fc17f47 commit b41e2b7

2 files changed

Lines changed: 20 additions & 210 deletions

File tree

src/tools/browser/mod.rs

Lines changed: 15 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,6 @@ pub mod session;
3535
use std::fs;
3636
use std::path::Path;
3737
use std::sync::OnceLock;
38-
use std::time::Duration;
39-
40-
use chromiumoxide::browser::{Browser, BrowserConfig, HeadlessMode};
41-
use chromiumoxide::handler::viewport::Viewport;
42-
use futures::StreamExt;
4338

4439
use crate::config::BrowserConfig as AppBrowserConfig;
4540

@@ -163,11 +158,9 @@ fn which_browser(name: &str) -> Option<String> {
163158

164159
/// Fetch HTML content using headless browser and convert to readable markdown
165160
///
166-
/// This function:
167-
/// 1. Requires Chrome/Chromium to be installed
168-
/// 2. Renders page with headless browser (handles SPAs/JS)
169-
/// 3. Applies readability algorithm to extract main content
170-
/// 4. Converts to markdown for token-efficient representation
161+
/// This is the one-shot convenience function. It launches a browser, navigates,
162+
/// waits for the page to settle, extracts readable content, and tears down the
163+
/// browser — all using the same code path as persistent browser sessions.
171164
///
172165
/// Browser settings (executable path, profile) are read from the global
173166
/// BrowserContext initialized at app startup.
@@ -184,30 +177,21 @@ pub async fn fetch_html(url: &str, max_length: Option<usize>) -> Result<FetchHtm
184177
));
185178
}
186179

187-
// Get browser settings from global context
188-
let ctx = browser_context();
189-
190-
// Resolve browser executable: context -> auto-detect
191-
let browser_path = ctx
192-
.and_then(|c| c.chrome_executable.clone())
193-
.or_else(detect_browser)
194-
.ok_or_else(|| {
195-
"No Chrome/Chromium browser found. Install chromium or google-chrome to use this tool."
196-
.to_string()
197-
})?;
198-
199-
// Get settings from context
200-
let ctx = ctx.cloned().unwrap_or_default();
180+
// Launch browser, navigate, wait for settle — shared with session module
181+
let (mut browser, page, handler_task, temp_dir) = session::launch_browser(url).await?;
201182

202-
let html = fetch_with_browser(url, &browser_path, &ctx).await?;
183+
// Extract content
184+
let content = session::extract_page_content(&page, url).await?;
203185

204-
// Apply readability to extract main content
205-
let readable = extract_readable_content(&html, url)?;
206-
207-
// Convert to markdown
208-
let mut markdown = html_to_markdown(&readable.content);
186+
// Tear down immediately (one-shot, no persistent session)
187+
let _ = browser.close().await;
188+
handler_task.abort();
189+
if let Some(ref temp) = temp_dir {
190+
let _ = std::fs::remove_dir_all(temp);
191+
}
209192

210193
// Truncate if needed
194+
let mut markdown = content.markdown;
211195
if markdown.len() > max_length {
212196
let mut end = max_length;
213197
while end > 0 && !markdown.is_char_boundary(end) {
@@ -223,185 +207,11 @@ pub async fn fetch_html(url: &str, max_length: Option<usize>) -> Result<FetchHtm
223207

224208
Ok(FetchHtmlResult {
225209
content: markdown,
226-
title: readable.title,
210+
title: content.title,
227211
url: url.to_string(),
228212
})
229213
}
230214

231-
/// Fetch page using headless browser (handles JavaScript rendering)
232-
///
233-
/// # Arguments
234-
/// * `url` - The URL to fetch
235-
/// * `browser_path` - Path to Chrome/Chromium executable
236-
/// * `ctx` - Browser context with profile, viewport, and timing settings
237-
async fn fetch_with_browser(
238-
url: &str,
239-
browser_path: &str,
240-
ctx: &BrowserContext,
241-
) -> Result<String, String> {
242-
// Copy profile to isolated temp directory to avoid SingletonLock conflicts.
243-
// See module doc comment for full explanation of why this is necessary.
244-
let temp_dir = if let (Some(data_dir), Some(prof)) =
245-
(&ctx.chrome_user_data_dir, &ctx.chrome_profile)
246-
{
247-
let source_profile = std::path::Path::new(data_dir).join(prof);
248-
if source_profile.exists() {
249-
// Use PID to isolate temp dirs between concurrent Codey instances
250-
let temp_base =
251-
std::env::temp_dir().join(format!("codey-browser-{}", std::process::id()));
252-
// Clean up any previous temp dir to ensure fresh copy
253-
if temp_base.exists() {
254-
let _ = std::fs::remove_dir_all(&temp_base);
255-
}
256-
std::fs::create_dir_all(&temp_base)
257-
.map_err(|e| format!("Failed to create temp browser dir: {}", e))?;
258-
259-
// Copy profile to temp_base/Default (becomes the default profile)
260-
let dest_profile = temp_base.join("Default");
261-
copy_dir_recursive(&source_profile, &dest_profile)
262-
.map_err(|e| format!("Failed to copy browser profile: {}", e))?;
263-
264-
Some(temp_base)
265-
} else {
266-
return Err(format!(
267-
"Browser profile not found: {}",
268-
source_profile.display()
269-
));
270-
}
271-
} else {
272-
None
273-
};
274-
275-
// Configure browser
276-
let headless_mode = if ctx.headless {
277-
HeadlessMode::True
278-
} else {
279-
HeadlessMode::False
280-
};
281-
282-
// When using a copied profile, we need real Keychain access to decrypt cookies.
283-
//
284-
// chromiumoxide's DEFAULT_ARGS (from Puppeteer) include:
285-
// "--password-store=basic" - Uses basic password store instead of OS keychain
286-
// "--use-mock-keychain" - Uses mock keychain for testing
287-
//
288-
// These flags are designed for automation/testing where you don't want system prompts,
289-
// but they prevent Chrome from decrypting cookies that were encrypted with the real
290-
// Keychain key. We must disable defaults and provide our own args list.
291-
//
292-
// Reference: chromiumoxide 0.7.0 browser.rs DEFAULT_ARGS
293-
// Original source: https://github.com/nickelc/chromiumoxide/blob/v0.7.0/src/browser.rs
294-
let using_profile = temp_dir.is_some();
295-
296-
let mut config = BrowserConfig::builder()
297-
.no_sandbox()
298-
.headless_mode(headless_mode)
299-
.window_size(ctx.viewport_width, ctx.viewport_height)
300-
.viewport(Viewport {
301-
width: ctx.viewport_width,
302-
height: ctx.viewport_height,
303-
device_scale_factor: None,
304-
emulating_mobile: false,
305-
is_landscape: false,
306-
has_touch: false,
307-
});
308-
309-
if using_profile {
310-
// Disable chromiumoxide defaults and add our own (without mock keychain flags)
311-
config = config.disable_default_args().args([
312-
"--disable-background-networking",
313-
"--enable-features=NetworkService,NetworkServiceInProcess",
314-
"--disable-background-timer-throttling",
315-
"--disable-backgrounding-occluded-windows",
316-
"--disable-breakpad",
317-
"--disable-client-side-phishing-detection",
318-
"--disable-component-extensions-with-background-pages",
319-
"--disable-default-apps",
320-
"--disable-dev-shm-usage",
321-
"--disable-extensions",
322-
"--disable-features=TranslateUI",
323-
"--disable-hang-monitor",
324-
"--disable-ipc-flooding-protection",
325-
"--disable-popup-blocking",
326-
"--disable-prompt-on-repost",
327-
"--disable-renderer-backgrounding",
328-
"--disable-sync",
329-
"--force-color-profile=srgb",
330-
"--metrics-recording-only",
331-
"--no-first-run",
332-
"--enable-automation",
333-
// OMITTED: "--password-store=basic" - need real password store for cookies
334-
// OMITTED: "--use-mock-keychain" - need real Keychain access
335-
"--enable-blink-features=IdleDetection",
336-
"--lang=en_US",
337-
"--disable-gpu",
338-
]);
339-
} else {
340-
config = config.arg("--disable-gpu");
341-
}
342-
343-
config = config.chrome_executable(browser_path);
344-
345-
// Use temp dir if we copied a profile, otherwise use user_data_dir directly
346-
if let Some(ref temp) = temp_dir {
347-
config = config.user_data_dir(temp);
348-
} else if let Some(ref dir) = ctx.chrome_user_data_dir {
349-
// Fallback: use user_data_dir directly (may conflict with running Chrome)
350-
config = config.user_data_dir(dir);
351-
if let Some(ref prof) = ctx.chrome_profile {
352-
config = config.arg(format!("--profile-directory={}", prof));
353-
}
354-
}
355-
356-
let config = config
357-
.build()
358-
.map_err(|e| format!("Failed to configure browser: {:?}", e))?;
359-
360-
// Launch browser with timeout
361-
let launch_result =
362-
tokio::time::timeout(Duration::from_secs(30), Browser::launch(config)).await;
363-
364-
let (mut browser, mut handler) = match launch_result {
365-
Ok(Ok((browser, handler))) => (browser, handler),
366-
Ok(Err(e)) => return Err(format!("Failed to launch browser: {}", e)),
367-
Err(_) => return Err("Browser launch timed out after 30 seconds".to_string()),
368-
};
369-
370-
// Spawn handler task (required by chromiumoxide)
371-
let handle = tokio::spawn(async move {
372-
while let Some(_event) = handler.next().await {
373-
// Process browser events
374-
}
375-
});
376-
377-
// Navigate to page
378-
let page_result = tokio::time::timeout(Duration::from_secs(60), async {
379-
let page = browser
380-
.new_page(url)
381-
.await
382-
.map_err(|e| format!("Failed to navigate: {}", e))?;
383-
384-
// Wait for JavaScript to render (configurable for SPAs like Twitter)
385-
tokio::time::sleep(Duration::from_millis(ctx.page_load_wait_ms)).await;
386-
387-
// Get rendered HTML
388-
page.content()
389-
.await
390-
.map_err(|e| format!("Failed to get page content: {}", e))
391-
})
392-
.await;
393-
394-
// Clean up
395-
let _ = browser.close().await;
396-
handle.abort();
397-
398-
match page_result {
399-
Ok(Ok(html)) => Ok(html),
400-
Ok(Err(e)) => Err(e),
401-
Err(_) => Err("Page load timed out after 60 seconds".to_string()),
402-
}
403-
}
404-
405215
/// Recursively copy a directory and its contents
406216
pub(crate) fn copy_dir_recursive(src: &Path, dst: &Path) -> std::io::Result<()> {
407217
fs::create_dir_all(dst)?;

src/tools/browser/session.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -398,13 +398,13 @@ impl BrowserSessionManager {
398398
// =============================================================================
399399

400400
/// Extracted page content
401-
struct PageContent {
402-
title: Option<String>,
403-
markdown: String,
401+
pub(crate) struct PageContent {
402+
pub(crate) title: Option<String>,
403+
pub(crate) markdown: String,
404404
}
405405

406406
/// Launch a new browser instance and navigate to the given URL.
407-
async fn launch_browser(
407+
pub(crate) async fn launch_browser(
408408
url: &str,
409409
) -> Result<(Browser, Page, JoinHandle<()>, Option<std::path::PathBuf>), String> {
410410
let ctx = browser_context().cloned().unwrap_or_default();
@@ -596,7 +596,7 @@ async fn get_current_url(page: &Page) -> Option<String> {
596596
}
597597

598598
/// Extract readable content from the current page.
599-
async fn extract_page_content(page: &Page, url: &str) -> Result<PageContent, String> {
599+
pub(crate) async fn extract_page_content(page: &Page, url: &str) -> Result<PageContent, String> {
600600
let html = page
601601
.content()
602602
.await

0 commit comments

Comments
 (0)