-
Notifications
You must be signed in to change notification settings - Fork 66.9k
Expand file tree
/
Copy pathdomwaiter.ts
More file actions
97 lines (84 loc) · 2.58 KB
/
domwaiter.ts
File metadata and controls
97 lines (84 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import { EventEmitter } from 'events'
import Bottleneck from 'bottleneck'
import { fetchWithRetry } from '@/frame/lib/fetch-utils'
import cheerio from 'cheerio'
import type { Permalink } from '@/search/scripts/scrape/types'
/**
 * Error used to signal a non-OK HTTP response.
 *
 * Exposes `response` and `request` fields in the same shape as got's
 * HTTPError so that handlers written against that interface keep working.
 */
class HTTPError extends Error {
  response: { ok: boolean; statusCode?: number }
  request: { requestUrl?: { pathname?: string } }

  /**
   * @param message - Human-readable description of the failure.
   * @param response - Outcome summary: the `ok` flag and, optionally, the status code.
   * @param request - Request summary: optionally the requested URL's pathname.
   */
  constructor(
    message: string,
    response: HTTPError['response'],
    request: HTTPError['request'],
  ) {
    super(message)
    // Override the default 'Error' name so the class is identifiable in logs.
    this.name = 'HTTPError'
    this.response = response
    this.request = request
  }
}
/**
 * Options accepted by `domwaiter`. Every field is optional; omitted fields
 * fall back to the defaults declared inside `domwaiter`. The merged options
 * object is also passed to the Bottleneck constructor.
 */
interface DomWaiterOptions {
// When truthy (default: true) and `json` is not set, the emitted page copy
// also gets a `$` property holding the cheerio-loaded response body.
parseDOM?: boolean
// When truthy (default: false), the response is parsed as JSON and attached
// to the emitted page copy as `json` instead of `body`.
json?: boolean
// Bottleneck option: how many jobs may run at the same time (default: 5).
maxConcurrent?: number
// Bottleneck option: minimum wait in milliseconds between launching jobs
// (default: 500).
minTime?: number
}
/**
 * Fetches every page through a rate-limited queue and reports progress on the
 * returned emitter.
 *
 * Events fired on the returned emitter:
 * - `beforePageLoad` (page): just before a page is requested.
 * - `page` (copy of the page with the loaded content): after a successful load.
 * - `error` (err): when loading a page fails or the limiter reports an error.
 * - `done`: when the limiter becomes idle, or, for an empty `pages` array,
 *   on a later turn of the event loop.
 *
 * @param pages - Pages to load; each one is scheduled as a separate job.
 * @param opts - Overrides for the defaults below. The merged object is also
 *   used as the Bottleneck constructor options.
 * @returns The emitter on which the events above are fired.
 */
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
  const emitter = new EventEmitter()

  // Merge into a fresh object instead of reassigning the `opts` parameter.
  const settings: DomWaiterOptions = {
    parseDOM: true,
    json: false,
    maxConcurrent: 5,
    minTime: 500,
    ...opts,
  }
  const limiter = new Bottleneck(settings)

  // Register limiter listeners before any job is queued.
  limiter.on('idle', () => {
    emitter.emit('done')
  })
  limiter.on('error', (err) => {
    emitter.emit('error', err)
  })

  if (pages.length === 0) {
    // With no jobs scheduled the limiter never fires 'idle', so 'done' would
    // never be emitted. Defer the emit so the caller can attach its
    // listeners to the returned emitter first.
    setImmediate(() => {
      emitter.emit('done')
    })
    return emitter
  }

  for (const page of pages) {
    limiter.schedule(() => getPage(page, emitter, settings))
  }

  return emitter
}
/**
 * Loads a single page and reports the outcome on `emitter`.
 *
 * Emits `beforePageLoad` with the page first. On success, emits `page` with a
 * shallow copy of the page extended with either `json` (when `opts.json` is
 * set) or `body`, plus a cheerio `$` when `opts.parseDOM` is set. A non-OK
 * response is turned into an HTTPError; that and any other failure raised
 * while loading is emitted as `error`.
 *
 * @param page - Page whose `url` is fetched.
 * @param emitter - Emitter that receives the events described above.
 * @param opts - Controls how the response body is consumed.
 */
async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
  emitter.emit('beforePageLoad', page)

  try {
    // The request and status check are the same for both output modes.
    const res = await fetchWithRetry(page.url!, undefined, { retries: 3 })
    if (!res.ok) {
      throw new HTTPError(
        `HTTP ${res.status}: ${res.statusText}`,
        { ok: res.ok, statusCode: res.status },
        { requestUrl: { pathname: page.url } },
      )
    }

    if (opts.json) {
      const json = await res.json()
      emitter.emit('page', Object.assign({}, page, { json }))
      return
    }

    const body = await res.text()
    const loaded = Object.assign({}, page, { body })
    if (opts.parseDOM) (loaded as any).$ = cheerio.load(body)
    emitter.emit('page', loaded)
  } catch (err) {
    emitter.emit('error', err)
  }
}