// large-file.test.ts
import { PDFParse } from 'pdf-parse';
import { describe, expect, test } from 'vitest';
// https://ntrs.nasa.gov/search
// https://op.europa.eu/en/web/general-publications/publications
// https://www.ipcc.ch/report/ar6/syr/?spm=a2ty_o01.29997173.0.0.bd03c921juVCxB
/**
 * Network-backed tests that run `PDFParse` against PDFs hosted on the
 * project's GitHub Pages site.
 *
 * The two "large file" tests load the document with range requests only
 * (streaming and auto-fetch disabled) and assert that fewer bytes were loaded
 * than the file's total size. The Bitcoin test loads a small file with default
 * options and asserts that it was loaded completely.
 */
describe('Large PDF File Tests', () => {
  // Every test here performs real HTTP requests, so each gets a generous timeout.
  const networkTimeoutMs = 30000;

  // The original source comment next to this URL read "242 MB" and linked to:
  // https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_Full_Report.pdf?spm=a2ty_o01.29997173.0.0.bd03c921juVCxB&file=IPCC_AR6_WGI_Full_Report.pdf
  // NOTE(review): the second test expects 'EDUCATION ON THE MOVE' on the last
  // page of this same file, so confirm which document (and what size) is
  // actually hosted here.
  const largeFileUrl = 'https://mehmet-kozan.github.io/pdf-parse/pdf/large-file.pdf';

  /**
   * Builds a fresh parser for the large fixture, configured to fetch the file
   * in fixed-size ranges instead of downloading all of it.
   *
   * A new options object is created on every call so tests never share state.
   */
  function createRangedParser(): PDFParse {
    return new PDFParse({
      url: largeFileUrl,
      // Do not automatically fetch the remaining pages of the file.
      // Streaming must also be disabled for this to take effect.
      disableAutoFetch: true,
      // Disable streaming-based loading; data is then fetched in chunks.
      // Often combined with range requests to get predictable partial downloads.
      disableStream: true,
      // Number of bytes requested per range request.
      // Smaller values -> more requests; larger values -> fewer, bigger requests.
      // Example: 131072 = 128 KB
      // Default: 65536 (= 2^16)
      rangeChunkSize: 65536,
    });
  }

  /**
   * Runs `run` and always destroys `parser` afterwards, even when `run`
   * rejects, so a failing extraction does not leave the parser undestroyed.
   *
   * @param parser - Parser to destroy once `run` has settled.
   * @param run - Extraction to perform with the parser.
   * @returns Whatever `run` resolves to.
   */
  async function runThenDestroy<T>(parser: PDFParse, run: () => Promise<T>): Promise<T> {
    try {
      return await run();
    } finally {
      await parser.destroy();
    }
  }

  test('should parse climate report', { timeout: networkTimeoutMs }, async () => {
    const parser = createRangedParser();

    const result = await runThenDestroy(parser, () =>
      parser.getText({
        first: 3, // Only first 3 pages
      }),
    );

    expect(result.pages.length).toBeLessThanOrEqual(3);
    expect(result.text.length).toBeGreaterThan(0);
    // Only part of the file should have been downloaded.
    expect(parser.progress.total).toBeGreaterThan(parser.progress.loaded);
  });

  // NOTE(review): the title mentions arxiv.org, but this test reads the last
  // page of the same large-file.pdf fixture as the test above — consider
  // renaming the test.
  test('should handle arxiv.org research paper', { timeout: networkTimeoutMs }, async () => {
    const parser = createRangedParser();

    const result = await runThenDestroy(parser, () =>
      parser.getText({
        last: 1,
      }),
    );

    expect(result.text).toContain('EDUCATION ON THE MOVE');
    expect(result.pages.length).toBeLessThanOrEqual(1);
    // Only part of the file should have been downloaded.
    expect(parser.progress.total).toBeGreaterThan(parser.progress.loaded);
  });

  test('should parse Bitcoin Whitepaper (small but reliable)', { timeout: networkTimeoutMs }, async () => {
    // Bitcoin Whitepaper - ~180 KB (small but good for testing)
    // https://bitcoin.org/bitcoin.pdf
    const url = 'https://mehmet-kozan.github.io/pdf-parse/pdf/bitcoin.pdf';
    const parser = new PDFParse({ url });

    const result = await runThenDestroy(parser, () => parser.getText());

    expect(result.text).toContain('Bitcoin');
    expect(result.text).toContain('Satoshi Nakamoto');
    // With default options the whole file is expected to have been loaded.
    expect(parser.progress.loaded).toBeGreaterThanOrEqual(parser.progress.total);
  });
});