|
7 | 7 | import pathlib |
8 | 8 | import re |
9 | 9 | from typing import Any, Callable, Dict, List, Optional |
| 10 | +from urllib.parse import urlparse, urljoin, urlencode, parse_qs |
10 | 11 |
|
11 | 12 | from playwright.async_api import Page |
12 | 13 | from playwright.async_api import async_playwright |
@@ -287,59 +288,313 @@ async def _handle_scroll(page: Page, step: BaseStep) -> None: |
287 | 288 |
|
288 | 289 |
|
# In-page script shared by the main-page and per-frame download attempts.
# It walks the DOM (descending into open shadow roots) looking for an element
# whose id is 'download' or 'save'. Called with `true` it clicks the first
# match; called with `false` it only reports whether such an element exists.
_VIEWER_DOWNLOAD_JS = """(doClick) => {
    const targetIds = ['download', 'save'];
    const visited = new Set();

    function visit(node) {
        if (!node || visited.has(node)) return false;
        visited.add(node);
        if (node.id && targetIds.includes(node.id)) {
            if (doClick) node.click();
            return true;
        }
        if (node.shadowRoot) {
            for (const child of Array.from(node.shadowRoot.children)) {
                if (visit(child)) return true;
            }
        }
        for (const child of Array.from(node.children)) {
            if (visit(child)) return true;
        }
        return false;
    }
    return visit(document.documentElement);
}"""

# In-page script that looks for the PDF source behind common viewer elements
# (<embed>, <object>, <iframe>). When `allowAnyIframe` is true and nothing
# PDF-looking was found, the first iframe that has a src is returned instead.
# Raw string: the JS regex literal contains a backslash.
_FIND_PDF_SRC_JS = r"""(allowAnyIframe) => {
    const getAbs = (src) => {
        if (!src) return null;
        try {
            return new URL(src, window.location.href).toString();
        } catch {
            return src;
        }
    };

    const embed = document.querySelector('embed[type="application/pdf"]');
    if (embed && embed.getAttribute('src')) return getAbs(embed.getAttribute('src'));

    const objectEl = document.querySelector('object[type="application/pdf"]');
    if (objectEl && objectEl.getAttribute('data')) return getAbs(objectEl.getAttribute('data'));

    const iframes = Array.from(document.querySelectorAll('iframe'));
    const pdfFrame = iframes.find(f => {
        const s = f.getAttribute('src') || '';
        return /\.pdf/i.test(s) || s.includes('pdf');
    });
    if (pdfFrame && pdfFrame.getAttribute('src')) return getAbs(pdfFrame.getAttribute('src'));

    if (allowAnyIframe) {
        const anyFrame = iframes.find(f => f.getAttribute('src'));
        if (anyFrame) return anyFrame.src;
    }
    return null;
}"""

# In-page script that collects up to three hrefs of anchors that look like
# download links (a `download` attribute, or "download" in text/aria-label).
_DOWNLOAD_LINKS_JS = """() => {
    const links = [];
    const anchors = Array.from(document.querySelectorAll('a'));
    for (const a of anchors) {
        const text = (a.textContent || '').toLowerCase();
        const aria = (a.getAttribute('aria-label') || '').toLowerCase();
        if (a.hasAttribute('download') || text.includes('download') || aria.includes('download')) {
            if (a.href) links.push(a.href);
        }
    }
    return links.slice(0, 3);
}"""


def _looks_like_pdf(body: bytes, content_type: str) -> bool:
    """Return True when a response body or its content type indicates a PDF."""
    if not body:
        return False
    # Accept the "%PDF-" marker anywhere in the first 1024 bytes so a little
    # leading junk does not cause a rejection.
    return b"%PDF-" in body[:1024] or "application/pdf" in content_type.lower()


def _pdf_url_from_location(current_url: str) -> Optional[str]:
    """Derive a PDF URL from the page URL itself or from its query string."""
    try:
        query_params = parse_qs(urlparse(current_url).query)
        for param in ("file", "src", "document", "url"):
            values = query_params.get(param)
            if values and re.search(r"\.pdf", values[0], re.IGNORECASE):
                return urljoin(current_url, values[0])
    except ValueError:
        # Malformed URL: fall through to the plain substring check below.
        pass

    if re.search(r"\.pdf", current_url, re.IGNORECASE):
        return current_url
    return None


async def _discover_pdf_src(page: Page, allow_any_iframe: bool) -> Optional[str]:
    """Ask the page for the source URL of an embedded PDF viewer, if any."""
    try:
        return await page.evaluate(_FIND_PDF_SRC_JS, allow_any_iframe)
    except Exception as e:
        print(f" 📄 PDF source discovery failed: {e}")
        return None


async def _fetch_pdf_bytes(
    page: Page, url: str, referer: str, accept: Optional[str] = None
) -> Optional[bytes]:
    """GET `url` through the browser context and return the body if it is a PDF."""
    headers = {"Referer": referer, "User-Agent": "Mozilla/5.0"}
    if accept:
        headers["Accept"] = accept

    response = None
    try:
        # The context-bound request API reuses the browser context's cookies,
        # so no second Playwright driver and no hand-built Cookie header.
        response = await page.context.request.get(url, headers=headers)
        if not response.ok:
            # status_text is a property in Playwright's Python API.
            print(f" 📄 GET {url} -> {response.status} {response.status_text}")
            return None

        body = await response.body()
        content_type = next(
            (v for k, v in response.headers.items() if k.lower() == "content-type"), ""
        )
        if not _looks_like_pdf(body, content_type):
            print(f" 📄 GET {url} returned non-PDF content ({content_type or 'unknown type'}); skipping")
            return None
        return body
    except Exception as e:
        print(f" 📄 GET {url} failed: {e}")
        return None
    finally:
        if response is not None:
            try:
                await response.dispose()
            except Exception:
                # Best-effort cleanup; a dispose failure must not mask the result.
                pass


async def _prepare_target(target_path_base: str, collector: Dict[str, Any]) -> str:
    """Resolve placeholders in the target path and ensure its directory exists."""
    resolved_path = replace_data_placeholders(target_path_base, collector) or target_path_base
    await _ensure_dir(resolved_path)
    return resolved_path


async def _write_pdf(target_path_base: str, collector: Dict[str, Any], data: bytes) -> str:
    """Write `data` to the resolved target path and return that path."""
    resolved_path = await _prepare_target(target_path_base, collector)
    with open(resolved_path, "wb") as f:
        f.write(data)
    return resolved_path


async def _try_viewer_download(
    page: Page, frame: Any, target_path_base: str, collector: Dict[str, Any]
) -> Optional[str]:
    """Click a download/save control inside `frame` and save the resulting download."""
    try:
        # Probe first: entering expect_download without a control to click
        # would only burn the whole timeout for every frame.
        if not await frame.evaluate(_VIEWER_DOWNLOAD_JS, False):
            return None
        async with page.expect_download(timeout=5000) as dl_info:
            await frame.evaluate(_VIEWER_DOWNLOAD_JS, True)
        dl = await dl_info.value
        resolved_path = await _prepare_target(target_path_base, collector)
        await dl.save_as(resolved_path)
        return resolved_path
    except Exception as e:
        print(f" 📄 Viewer download attempt failed: {e}")
        return None


async def _handle_save_pdf(page: Page, step: BaseStep, collector: Dict[str, Any]) -> None:
    """Handle the savePDF action.

    Strategies are tried in order until one produces a file:

    1. Resolve a direct PDF URL (from the page URL, its query string, or an
       embedded viewer element) and fetch the binary through the browser
       context.
    2. Click a download/save control in the main page or any child frame and
       capture the browser download.
    3. Fetch the href of anchors that look like download links.
    4. Fall back to printing the current page with ``page.pdf``.

    Args:
        page: The Playwright page the step operates on.
        step: The step definition. ``step.value`` is the target file path
            (placeholders are resolved against ``collector``), ``step.wait``
            is an optional wait in milliseconds, and ``step.key`` /
            ``step.id`` select the collector key.
        collector: Shared result dictionary. The saved path, or ``None`` when
            every strategy failed, is stored under
            ``step.key or step.id or "file"``.

    Raises:
        ValueError: If ``step.value`` is empty. All other failures are logged
            and reported as ``None`` in the collector.
    """
    if not step.value:
        raise ValueError(f"savePDF step {step.id} requires 'value' as target filepath")

    collector_key = step.key or step.id or "file"
    saved_path: Optional[str] = None
    target_path_base: str = step.value

    try:
        # Ensure the page finished initial navigation (best effort).
        try:
            await page.wait_for_load_state("domcontentloaded", timeout=step.wait or 600000)
        except Exception as e:
            print(f" 📄 wait_for_load_state did not complete, continuing: {e}")

        current_url = page.url
        print(f" 📄 Current URL: {current_url}")

        # 1) The current URL (or one of its query parameters) points at a PDF.
        pdf_url = _pdf_url_from_location(current_url)

        # 2) Otherwise look for the source of a common viewer element.
        if not pdf_url:
            pdf_url = await _discover_pdf_src(page, False)

        # 3) Optional extra wait; some viewers populate 'src' late. On this
        #    second pass any iframe with a src is accepted, because the
        #    fetched content is validated as a PDF before it is written.
        if not pdf_url and step.wait and step.wait > 0:
            await page.wait_for_timeout(step.wait)
            pdf_url = await _discover_pdf_src(page, True)

        if pdf_url:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            candidate_urls: List[str] = [urljoin(current_url, pdf_url)]
            print(f" 📄 Candidate PDF URLs: {candidate_urls}")

            for candidate_url in candidate_urls:
                body = await _fetch_pdf_bytes(page, candidate_url, current_url)
                if body:
                    saved_path = await _write_pdf(target_path_base, collector, body)
                    print(f" 📄 PDF saved to {saved_path} (from {candidate_url})")
                    break

            if not saved_path:
                print(" 📄 All candidate PDF URLs failed. Trying viewer download fallback...")
        else:
            print(" 📄 Direct PDF URL not found. Trying viewer download fallback...")

        # Fallback A: click a download control, main frame first, then child frames.
        if not saved_path:
            main_frame = page.main_frame
            frames = [main_frame] + [f for f in page.frames if f != main_frame]
            for frame in frames:
                saved_path = await _try_viewer_download(page, frame, target_path_base, collector)
                if saved_path:
                    print(f" 📄 PDF saved via viewer download to {saved_path}")
                    break

        # Fallback B: scrape direct download link hrefs and fetch them.
        if not saved_path:
            try:
                hrefs = await page.evaluate(_DOWNLOAD_LINKS_JS)
            except Exception as e:
                print(f" 📄 Download link discovery failed: {e}")
                hrefs = []

            for href in hrefs or []:
                body = await _fetch_pdf_bytes(
                    page, href, current_url, accept="application/pdf,*/*"
                )
                if body:
                    saved_path = await _write_pdf(target_path_base, collector, body)
                    print(f" 📄 PDF saved via scraped href to {saved_path}")
                    break

        # Fallback C: print the rendered page, as the docstring promises.
        # A failure here is reported by the outer handler.
        if not saved_path:
            print(" 📄 Viewer download fallback failed.")
            print(" 📄 Falling back to page.pdf...")
            pdf_bytes = await page.pdf(format="A4")
            saved_path = await _write_pdf(target_path_base, collector, pdf_bytes)
            print(f" 📄 PDF saved to {saved_path}")
    except Exception as e:
        print(f" 📄 savePDF failed: {e}")
    finally:
        # Always record the outcome so later steps can read it (None on failure).
        collector[collector_key] = saved_path
343 | 598 |
|
344 | 599 |
|
345 | 600 | async def _handle_download_pdf(page: Page, step: BaseStep, collector: Dict[str, Any]) -> None: |
|
0 commit comments