// Source: tests/unit/test-example/large-file.test.ts (mehmet-kozan/pdf-parse, main branch)
import { PDFParse } from 'pdf-parse';
import { describe, expect, test } from 'vitest';

// Public sources of large PDF documents:
// https://ntrs.nasa.gov/search
// https://op.europa.eu/en/web/general-publications/publications
// https://www.ipcc.ch/report/ar6/syr/

describe('Large PDF File Tests', () => {
	// Extracts text from only the first pages of a large remote PDF, with
	// auto-fetch and streaming disabled so the data is requested in chunks, then
	// checks that the reported progress stayed below the reported total.
	test('should parse climate report', { timeout: 30000 }, async () => {
		// 242 MB
		// https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_Full_Report.pdf
		const url = 'https://mehmet-kozan.github.io/pdf-parse/pdf/large-file.pdf';

		const parser = new PDFParse({
			url,

			// Do not automatically fetch additional pages of the file.
			// Streaming must also be disabled for this to work correctly.
			disableAutoFetch: true,

			// Disable streaming-based loading; data is fetched in chunks instead.
			// Often combined with range requests for predictable partial downloads.
			disableStream: true,

			// Number of bytes requested per range request.
			// Smaller values -> more requests; larger values -> fewer, bigger requests.
			// Example: 131072 = 128 KB
			// Default: 65536 (= 2^16)
			rangeChunkSize: 65536,
		});

		let result: Awaited<ReturnType<PDFParse['getText']>>;
		try {
			result = await parser.getText({
				first: 3, // Only first 3 pages
			});
		} finally {
			// Release the parser even when getText rejects, so a failing run
			// does not leave the parser and its pending requests open.
			await parser.destroy();
		}

		expect(result.pages.length).toBeLessThanOrEqual(3);
		expect(result.text.length).toBeGreaterThan(0);
		// total > loaded: presumably byte counts, i.e. the file was only partially
		// downloaded — confirm against the PDFParse progress contract.
		expect(parser.progress.total).toBeGreaterThan(parser.progress.loaded);
	});

	// Requests only the tail of the large remote PDF (`last: 1`) and checks the
	// extracted text, the page count, and that progress stayed below the total.
	// NOTE(review): the title mentions arxiv.org, but the fixture is the same
	// large-file.pdf used by the climate-report test — consider renaming.
	test('should handle arxiv.org research paper', { timeout: 30000 }, async () => {
		const parser = new PDFParse({
			url: 'https://mehmet-kozan.github.io/pdf-parse/pdf/large-file.pdf',
			// Auto-fetch and streaming are both disabled and a range chunk size is
			// set, so the parser should not need to download the whole file.
			disableAutoFetch: true,
			disableStream: true,
			rangeChunkSize: 65536,
		});

		let result: Awaited<ReturnType<PDFParse['getText']>>;
		try {
			result = await parser.getText({
				last: 1,
			});
		} finally {
			// Release the parser even when getText rejects.
			await parser.destroy();
		}

		expect(result.text).toContain('EDUCATION ON THE MOVE');
		expect(result.pages.length).toBeLessThanOrEqual(1);
		// total > loaded: presumably byte counts, i.e. a partial download —
		// confirm against the PDFParse progress contract.
		expect(parser.progress.total).toBeGreaterThan(parser.progress.loaded);
	});

	// Parses a small remote PDF with default load options and checks that known
	// strings appear in the text and that progress reached the reported total.
	test('should parse Bitcoin Whitepaper (small but reliable)', { timeout: 30000 }, async () => {
		// Bitcoin Whitepaper - ~180 KB (small but good for testing)
		// https://bitcoin.org/bitcoin.pdf
		const url = 'https://mehmet-kozan.github.io/pdf-parse/pdf/bitcoin.pdf';

		const parser = new PDFParse({ url });

		let result: Awaited<ReturnType<PDFParse['getText']>>;
		try {
			result = await parser.getText();
		} finally {
			// Release the parser even when getText rejects.
			await parser.destroy();
		}

		expect(result.text).toContain('Bitcoin');
		expect(result.text).toContain('Satoshi Nakamoto');
		// No partial-loading options were set, so loaded is expected to reach total.
		expect(parser.progress.loaded).toBeGreaterThanOrEqual(parser.progress.total);
	});
});