crawl4ai/tests/test_issue_1455_cache_extraction.py at 3011f0695e87ce4b1c41cfd9ef8070a7972b9e94 · unclecode/crawl4ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Reproduction test for issue #1455:
Extraction strategy is skipped when cache_mode=ENABLED and cache hits.

Uses JsonCssExtractionStrategy (no LLM needed) to verify that extraction
runs on cached HTML, not just on fresh fetches.
"""

import asyncio
import json
import socket
import threading
import time
import pytest
from aiohttp import web

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


PRODUCT_PAGE = """
<!DOCTYPE html>
<html>
<head><title>Product Catalog - Test Store</title>
<meta name="description" content="Browse our product catalog with great deals">
</head>
<body>
<header><h1>Test Store Product Catalog</h1>
<nav><a href="/">Home</a> <a href="/products">Products</a> <a href="/about">About</a></nav>
</header>
<main>
<p>Welcome to our store. Browse our selection of quality products below.
We offer competitive prices and fast shipping on all orders.</p>
<div class="product" data-testid="product">
  <span class="name">Widget A</span>
  <span class="price">$9.99</span>
  <p>A high-quality widget for everyday use. Built to last with premium materials.</p>
</div>
<div class="product" data-testid="product">
  <span class="name">Widget B</span>
  <span class="price">$19.99</span>
  <p>Our premium widget with advanced features and extended warranty included.</p>
</div>
</main>
<footer><p>Copyright 2026 Test Store. All rights reserved.</p></footer>
</body></html>
"""

SCHEMA = {
    "name": "Products",
    "baseSelector": "div.product[data-testid='product']",
    "fields": [
        {"name": "name", "selector": "span.name", "type": "text"},
        {"name": "price", "selector": "span.price", "type": "text"},
    ],
}


def _find_free_port():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


@pytest.fixture(scope="module")
def test_server():
    port = _find_free_port()

    async def handle(request):
        return web.Response(text=PRODUCT_PAGE, content_type="text/html")

    app = web.Application()
    app.router.add_get("/products", handle)

    ready = threading.Event()

    def run():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        runner = web.AppRunner(app)
        loop.run_until_complete(runner.setup())
        site = web.TCPSite(runner, "localhost", port)
        loop.run_until_complete(site.start())
        ready.set()
        loop.run_forever()

    t = threading.Thread(target=run, daemon=True)
    t.start()
    assert ready.wait(timeout=10)
    time.sleep(0.2)
    yield f"http://localhost:{port}"


@pytest.mark.asyncio
async def test_extraction_runs_on_cache_hit(test_server):
    """
    Bug #1455: extraction strategy must run even when result comes from cache.

    1. First crawl WITHOUT extraction (populates cache)
    2. Second crawl WITH extraction + cache_mode=ENABLED (cache hit)
    3. Verify extracted_content is populated (not empty)
    """
    url = f"{test_server}/products"

    # Step 1: Warm the cache (no extraction strategy)
    config_warm = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
    )
    async with AsyncWebCrawler(verbose=False) as crawler:
        result1 = await crawler.arun(url=url, config=config_warm)
    assert result1.success

    # Step 2: Crawl again WITH extraction strategy (should hit cache)
    extraction = JsonCssExtractionStrategy(SCHEMA)
    config_extract = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        extraction_strategy=extraction,
    )
    async with AsyncWebCrawler(verbose=False) as crawler:
        result2 = await crawler.arun(url=url, config=config_extract)

    assert result2.success
    data = json.loads(result2.extracted_content)
    assert len(data) == 2, f"Expected 2 products, got {len(data)}"
    assert data[0]["name"] == "Widget A"
    assert data[1]["name"] == "Widget B"


@pytest.mark.asyncio
async def test_cache_without_extraction_still_works(test_server):
    """Cache hit without extraction strategy should still return normally."""
    url = f"{test_server}/products"

    config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    async with AsyncWebCrawler(verbose=False) as crawler:
        result = await crawler.arun(url=url, config=config)
    assert result.success
    assert "Widget A" in result.html


if __name__ == "__main__":
    asyncio.run(test_extraction_runs_on_cache_hit.__wrapped__(None))