forked from jshemas/openGraphScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrequest.ts
More file actions
117 lines (106 loc) · 3.71 KB
/
request.ts
File metadata and controls
117 lines (106 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import { fetch } from 'undici';
import { decode } from 'iconv-lite';
import { CheerioAPI, load } from 'cheerio';
import chardet from 'chardet';
import type { OpenGraphScraperOptions } from './types';
/**
 * checks if an element exists
 *
 * Runs `selector` against the loaded document and reports whether the attribute
 * read back from the match is present and not empty.
 *
 * @param {string} selector - CSS selector used to look up the element
 * @param {string} attribute - name of the attribute that must carry a value
 * @param {object} $ - cheerio instance loaded with the page html
 * @return {boolean} true only when the attribute exists and is a non-empty string
 */
const doesElementExist = (selector: string, attribute: string, $: CheerioAPI): boolean => {
  // Query once; attr() yields undefined when nothing matched or the attribute is absent.
  const value = $(selector).attr(attribute);
  return value !== undefined && value.length > 0;
};
/**
 * gets the charset of the html
 *
 * Sources are tried in this order, and the first one that yields a value wins:
 *   1. `<meta charset="...">`
 *   2. `<meta name="charset" content="...">` inside `<head>`
 *   3. the `charset=` parameter of `<meta http-equiv="content-type" content="...">` inside `<head>`
 *   4. byte-level detection with chardet (only when `body` is not empty)
 *   5. the literal 'utf-8'
 *
 * @param {string} body - the page decoded as text; only checked for emptiness
 * @param {object} buffer - the raw response bytes handed to chardet
 * @param {object} $ - cheerio instance loaded with the page html
 * @return {string} the charset name; whatever chardet returns is passed through unchanged
 */
function getCharset(body: string, buffer: ArrayBuffer, $: CheerioAPI) {
  // Select `meta[charset]` rather than `meta`: attr() reads from the first matched
  // element only, so a charset declared on any later <meta> tag would be missed.
  if (doesElementExist('meta[charset]', 'charset', $)) {
    return $('meta[charset]').attr('charset');
  }
  if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
    return $('head > meta[name="charset"]').attr('content');
  }
  if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
    const content = $('head > meta[http-equiv="content-type"]').attr('content') ?? '';
    // e.g. "text/html; charset=ISO-8859-1" -> "ISO-8859-1"
    const charsetMatch = /charset=([^()<>@,;:"/[\]?.=\s]*)/i.exec(content);
    if (charsetMatch?.[1]) return charsetMatch[1];
  }
  if (body) {
    // No usable declaration in the markup: let chardet guess from the raw bytes.
    return chardet.detect(Buffer.from(buffer));
  }
  return 'utf-8';
}
/**
 * Error messages for the HTTP statuses that are reported individually; every other
 * 4xx/5xx status is reported with a generic message.
 */
const httpErrorMessages: Readonly<Partial<Record<number, string>>> = {
  400: '400 Bad Request',
  401: '401 Unauthorized',
  403: '403 Forbidden',
  404: '404 Not Found',
  408: '408 Request Timeout',
  410: '410 Gone',
  429: '429 Too Many Requests',
  500: '500 Internal Server Error',
  502: '502 Bad Gateway',
  503: '503 Service Unavailable',
  504: '504 Gateway Timeout',
};

/**
 * performs the fetch request and formats the body for ogs
 *
 * The response headers are validated before the payload is read, so a response that
 * is going to be rejected (non-text content type, 4xx/5xx status) is never downloaded,
 * parsed or charset-detected.
 *
 * @param {object} options - options for ogs
 * @return {object} formatted request body and response
 * @throws {Error} when the content type is not text, the status is 4xx/5xx or the body is empty;
 * for a 'fetch failed' error the underlying cause is thrown instead of the wrapper
 *
 */
export default async function requestAndResultsFormatter(options: OpenGraphScraperOptions) {
  try {
    // eslint-disable-next-line no-control-regex
    const isLatin1 = /^[\u0000-\u00ff]{0,}$/;
    let url = options.url ?? '';
    // Characters outside Latin-1 are percent-encoded before the url is handed to fetch.
    if (!isLatin1.test(url)) url = encodeURI(url);

    const response = await fetch(
      url,
      {
        // Placed before the spread so a caller-supplied signal replaces the default timeout.
        signal: AbortSignal.timeout((options.timeout ?? 10) * 1000),
        ...options.fetchOptions,
        headers: { Origin: url, Accept: 'text/html', ...options.fetchOptions?.headers },
      },
    );

    // Same precedence as before: the content-type check wins over the status check.
    const contentType = response.headers?.get('content-type')?.toLowerCase();
    let rejection: string | undefined;
    if (contentType && !contentType.includes('text/')) {
      rejection = 'Page must return a header content-type with text/';
    } else if (response.status >= 400) {
      rejection = httpErrorMessages[response.status] ?? 'Server has returned a 400/500 error code';
    }
    if (rejection !== undefined) {
      try {
        // Release the connection instead of leaving an unread body behind.
        await response.body?.cancel();
      } catch {
        // Best effort only: the rejection below is the error the caller needs to see.
      }
      throw new Error(rejection);
    }

    const bodyArrayBuffer = await response.arrayBuffer();
    const bodyBuffer = Buffer.from(bodyArrayBuffer);
    // Decode as utf-8 first so the markup can be inspected for a charset declaration.
    const bodyText = bodyBuffer.toString('utf-8');
    const charset = getCharset(bodyText, bodyArrayBuffer, load(bodyText)) ?? 'utf-8';
    const body = charset.toLowerCase() === 'utf-8' ? bodyText : decode(bodyBuffer, charset);

    if (body === '') {
      throw new Error('Page not found');
    }
    return { body, response };
  } catch (error) {
    // Surface the underlying cause of a 'fetch failed' error, but never throw
    // `undefined` when no cause is attached.
    if (error instanceof Error && error.message === 'fetch failed') throw error.cause ?? error;
    throw error;
  }
}