Skip to content

Commit 92dd436

Browse files
authored
Add PDF file support in full DOM fetcher (#1232)
2 parents ef19240 + be5af6c commit 92dd436

3 files changed

Lines changed: 189 additions & 49 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased [minor]
6+
7+
> Development of this release was supported by [Reset Tech](https://www.reset.tech).
8+
9+
### Added
10+
11+
- Add PDF file support in full DOM fetcher
12+
513
## 10.4.0 - 2026-01-19
614

715
> Development of this release was supported by [Reset Tech](https://www.reset.tech).

src/archivist/fetcher/fullDomFetcher.js

Lines changed: 149 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,69 +8,76 @@ let browser;
88
export default async function fetch(url, cssSelectors, config) {
99
puppeteer.use(stealthPlugin({ locale: config.language }));
1010

11-
let context;
12-
let page;
13-
let client;
14-
let response;
15-
const selectors = [].concat(cssSelectors);
16-
1711
if (!browser) {
1812
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
1913
}
2014

15+
let context;
16+
let page;
17+
let client;
18+
2119
try {
2220
context = await browser.createBrowserContext(); // Create an isolated browser context to ensure complete isolation between fetches (cookies, localStorage, sessionStorage, IndexedDB, cache)
2321
page = await context.newPage();
22+
client = await page.createCDPSession();
2423

25-
await page.setViewport({ width: 1920, height: 1080 }); // Set a realistic viewport size to avoid detection based on default Puppeteer dimensions (800x600)
26-
await page.setDefaultNavigationTimeout(config.navigationTimeout);
27-
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
24+
await configurePage(page, client, config);
2825

29-
// Use CDP to ensure the browser language is set correctly (most reliable method, see https://zirkelc.dev/posts/puppeteer-language-experiment)
30-
client = await page.createCDPSession();
26+
const selectors = [].concat(cssSelectors).filter(Boolean);
3127

32-
await client.send('Network.setUserAgentOverride', {
33-
userAgent: await browser.userAgent(),
34-
acceptLanguage: config.language,
35-
});
28+
let pdf = {};
29+
let handled = null;
30+
31+
if (!selectors.length) { // CSS selectors are specified only for HTML content and omitted when fetching a PDF
32+
({ pdf, handled } = setupPdfInterception(client));
33+
}
3634

37-
if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
38-
await page.authenticate(browser.proxyCredentials);
35+
let response;
36+
let navigationAborted = false;
37+
38+
try {
39+
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
40+
} catch (error) {
41+
if (error.message.includes('net::ERR_ABORTED')) {
42+
// Chrome may sometimes abort navigation for files such as PDFs.
43+
// Do not throw for now; wait for the PDF interception handler to finish processing the response.
44+
navigationAborted = true;
45+
} else {
46+
throw error;
47+
}
3948
}
4049

41-
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
50+
// PDF interception handling
51+
if (handled) {
52+
await handled; // Wait for the interception callback to finish processing the response
53+
54+
if (pdf.content) {
55+
return {
56+
mimeType: 'application/pdf',
57+
content: pdf.content,
58+
};
59+
}
60+
61+
if (pdf.status) { // Status captured by CDP interception
62+
throw new Error(`Received HTTP code ${pdf.status} when trying to fetch '${url}'`);
63+
}
64+
}
65+
66+
if (navigationAborted) {
67+
throw new Error(`Navigation aborted when trying to fetch '${url}'`);
68+
}
4269

4370
if (!response) {
4471
throw new Error(`Response is empty when trying to fetch '${url}'`);
4572
}
4673

4774
const statusCode = response.status();
4875

49-
if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
76+
if (!isValidHttpStatus(statusCode)) {
5077
throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
5178
}
5279

53-
const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
54-
page.waitForFunction(
55-
cssSelector => {
56-
const element = document.querySelector(cssSelector); // eslint-disable-line no-undef
57-
58-
return element?.textContent.trim().length; // Ensures element exists and contains non-empty text, as an empty element may indicate content is still loading
59-
},
60-
{ timeout: config.waitForElementsTimeout },
61-
selector,
62-
));
63-
64-
// We expect all elements to be present on the page…
65-
await Promise.all(waitForSelectorsPromises).catch(error => {
66-
if (error.name == 'TimeoutError') {
67-
// however, if they are not, this is not considered as an error since selectors may be out of date
68-
// and the whole content of the page should still be returned.
69-
return;
70-
}
71-
72-
throw error;
73-
});
80+
await waitForSelectors(page, selectors, config.waitForElementsTimeout);
7481

7582
return {
7683
mimeType: 'text/html',
@@ -80,17 +87,10 @@ export default async function fetch(url, cssSelectors, config) {
8087
if (error.name === 'TimeoutError') {
8188
throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
8289
}
90+
8391
throw new Error(error.message);
8492
} finally {
85-
if (client) {
86-
await client.detach();
87-
}
88-
if (page) {
89-
await page.close();
90-
}
91-
if (context) {
92-
await context.close(); // Close the isolated context to free resources and ensure complete cleanup
93-
}
93+
await cleanupPage(client, page, context);
9494
}
9595
}
9696

@@ -151,3 +151,103 @@ export async function stopHeadlessBrowser() {
151151
await browser.close();
152152
browser = null;
153153
}
154+
155+
/**
 * Tells whether an HTTP status code denotes a response whose content can be used.
 *
 * @param {number} status - HTTP status code to check.
 * @returns {boolean} `true` for any 2xx code and for 304, `false` otherwise.
 */
function isValidHttpStatus(status) {
  if (status === 304) { // 304 is the only accepted code outside the 2xx range
    return true;
  }

  return status >= 200 && status < 300;
}
158+
159+
/**
 * Applies the per-fetch settings to a page before navigation: viewport, navigation timeout,
 * language headers, user agent override and, when available, proxy authentication.
 *
 * Reads the module-level `browser` for the user agent and the proxy credentials.
 *
 * @param {object} page - Puppeteer page to configure.
 * @param {object} client - CDP session attached to `page`, used to send `Network.setUserAgentOverride`.
 * @param {object} config - Fetch configuration; `config.navigationTimeout` and `config.language` are read.
 * @returns {Promise<void>} Resolves once every setting has been applied.
 */
async function configurePage(page, client, config) {
  // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
  const viewport = { width: 1920, height: 1080 };

  await page.setViewport(viewport);
  await page.setDefaultNavigationTimeout(config.navigationTimeout);
  await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });

  // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
  const userAgent = await browser.userAgent();

  await client.send('Network.setUserAgentOverride', { userAgent, acceptLanguage: config.language });

  // Authenticate against the proxy only when both a username and a password are available
  const credentials = browser.proxyCredentials;

  if (credentials?.username && credentials?.password) {
    await page.authenticate(credentials);
  }
}
174+
175+
/**
 * Sets up CDP `Fetch` interception on the given session so that PDF responses can be captured
 * before Chrome processes them (and hands them over to its PDF viewer).
 *
 * The function itself is synchronous: it registers the listener, sends `Fetch.enable` without
 * waiting for its completion, and returns immediately.
 *
 * @param {object} client - CDP session; only its `on` and `send` methods are used.
 * @returns {{ pdf: { content: (Buffer|null), status: (number|null) }, handled: Promise<void> }}
 *   - `pdf.status` is set to the status code of any paused response whose `Content-Type` includes `application/pdf`.
 *   - `pdf.content` is set to the response body when that status is valid and the body could be read.
 *   - `handled` resolves once a paused response of resource type `Document` has been processed,
 *     or if enabling the interception fails.
 */
function setupPdfInterception(client) {
  const pdf = { content: null, status: null };
  let onHandled;
  const handled = new Promise(resolve => { onHandled = resolve; });

  // Register the listener before enabling interception so that no paused request can be missed
  client.on('Fetch.requestPaused', async ({ requestId, resourceType, responseHeaders, responseStatusCode }) => {
    try {
      const contentType = responseHeaders?.find(header => header.name.toLowerCase() === 'content-type')?.value;

      if (!contentType?.includes('application/pdf')) {
        return; // Not a PDF: nothing to capture, the `finally` block still resumes the response
      }

      pdf.status = responseStatusCode;

      if (!isValidHttpStatus(responseStatusCode)) {
        return; // Keep the status so the caller can report it, but do not read the body
      }

      try {
        const { body, base64Encoded } = await client.send('Fetch.getResponseBody', { requestId });

        pdf.content = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
      } catch {
        // Response body may be unavailable due to network error or connection interruption
      }
    } finally {
      try {
        await client.send('Fetch.continueResponse', { requestId }); // Always resume the paused response, whatever happened above
      } catch {
        // Client may have been closed by cleanupPage() in fetch() while this async callback was still running
      }

      if (resourceType === 'Document') { // Signal that the main navigation request has been processed
        onHandled();
      }
    }
  });

  // Intercept all responses before Chrome processes them, which makes it possible to capture PDF content before it's handled by the PDF viewer
  client.send('Fetch.enable', { patterns: [{ urlPattern: '*', requestStage: 'Response' }] }).catch(() => {
    // Interception could not be enabled, so presumably no `Fetch.requestPaused` event will ever arrive.
    // Resolve `handled` so that a caller awaiting it does not hang and can fall back to the regular navigation result.
    onHandled();
  });

  return { pdf, handled };
}
218+
219+
/**
 * Waits until every given CSS selector matches an element with non-empty text on the page.
 *
 * A timeout is tolerated: selectors may be out of date, and the caller should still be able
 * to return the whole content of the page. Any other failure is rethrown unchanged.
 *
 * @param {object} page - Puppeteer page; only its `waitForFunction` method is used.
 * @param {Array<string>} selectors - CSS selectors to wait for; falsy entries are ignored.
 * @param {number} timeout - Value passed as the `timeout` option of each `waitForFunction` call.
 * @returns {Promise<void>} Resolves when all selectors matched or when a wait timed out.
 * @throws Rethrows any rejection whose `name` is not `TimeoutError`.
 */
async function waitForSelectors(page, selectors, timeout) {
  const waitForSelectorsPromises = selectors.filter(Boolean).map(selector =>
    page.waitForFunction(
      cssSelector => {
        const element = document.querySelector(cssSelector); // eslint-disable-line no-undef

        return element?.textContent.trim().length; // Ensures element exists and has non-empty text
      },
      { timeout },
      selector,
    ));

  try {
    // We expect all elements to be present on the page…
    await Promise.all(waitForSelectorsPromises);
  } catch (error) {
    if (error?.name === 'TimeoutError') {
      // however, if they are not, this is not considered as an error since selectors may be out of date
      // and the whole content of the page should still be returned.
      return;
    }

    throw error;
  }
}
242+
243+
/**
 * Releases the resources created for a fetch, in order: CDP session, page, then browser context.
 *
 * Missing (falsy) resources are skipped, and a failure while releasing one resource is ignored
 * so that the remaining ones are still released.
 *
 * @param {object} [client] - CDP session to detach.
 * @param {object} [page] - Page to close.
 * @param {object} [context] - Isolated browser context to close.
 * @returns {Promise<void>} Resolves once every provided resource has been processed.
 */
async function cleanupPage(client, page, context) {
  const ignoreFailure = () => {};
  const releaseSteps = [
    [ client, session => session.detach() ],
    [ page, openedPage => openedPage.close() ],
    [ context, isolatedContext => isolatedContext.close() ], // Close the isolated context to free resources and ensure complete cleanup
  ];

  for (const [ resource, release ] of releaseSteps) {
    if (!resource) {
      continue;
    }

    await release(resource).catch(ignoreFailure);
  }
}

src/archivist/fetcher/fullDomFetcher.test.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import fs from 'fs';
12
import http from 'http';
3+
import path from 'path';
4+
import { fileURLToPath } from 'url';
25

36
import { expect, use } from 'chai';
47
import chaiAsPromised from 'chai-as-promised';
58

69
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from './fullDomFetcher.js';
710

11+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
12+
813
const SERVER_PORT = 8977;
914

1015
use(chaiAsPromised);
@@ -16,6 +21,7 @@ describe('Full DOM Fetcher', function () {
1621
this.timeout(60000);
1722

1823
let temporaryServer;
24+
let expectedPDFContent;
1925

2026
before(async () => {
2127
await launchHeadlessBrowser();
@@ -27,6 +33,10 @@ describe('Full DOM Fetcher', function () {
2733
if (request.url === '/delayed-content') {
2834
response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
2935
}
36+
if (request.url === '/terms.pdf') {
37+
expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
38+
response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
39+
}
3040

3141
return response.end();
3242
}).listen(SERVER_PORT);
@@ -85,5 +95,27 @@ describe('Full DOM Fetcher', function () {
8595
await expect(fetch(url, ['.content'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
8696
});
8797
});
98+
99+
context('when URL targets a PDF file', () => {
100+
let content;
101+
let mimeType;
102+
const pdfUrl = `http://127.0.0.1:${SERVER_PORT}/terms.pdf`;
103+
104+
before(async () => {
105+
({ content, mimeType } = await fetch(pdfUrl, [], config));
106+
});
107+
108+
it('returns a buffer for PDF content', () => {
109+
expect(content).to.be.an.instanceOf(Buffer);
110+
});
111+
112+
it('returns the correct MIME type', () => {
113+
expect(mimeType).to.equal('application/pdf');
114+
});
115+
116+
it('returns the PDF file content', () => {
117+
expect(content.equals(expectedPDFContent)).to.be.true;
118+
});
119+
});
88120
});
89121
});

0 commit comments

Comments
 (0)