-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathembeddings-transformers-noc.js
More file actions
79 lines (62 loc) · 2.11 KB
/
embeddings-transformers-noc.js
File metadata and controls
79 lines (62 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import * as fs from 'fs';
import { pipeline } from '@xenova/transformers';
// Load the embeddings model once at module scope (top-level await).
// NOTE(review): this downloads/caches 'Xenova/bge-small-en-v1.5' on first run —
// requires network access the first time; verify cache location for CI use.
const extractor = await pipeline('feature-extraction', 'Xenova/bge-small-en-v1.5');
// Accumulates { text, embedding } records across ALL transcript files;
// written out as a single combined embeddings.json at the end of the run.
const fullOutput = [];
// Main driver: chunk every markdown transcript, embed each chunk, and persist
// per-file embeddings plus one combined embeddings.json.
(async () => {
  // Scan the transcripts directory, keeping only markdown files — a stray
  // non-.md entry would otherwise be read as text and written under a
  // wrong output name (the '.md' -> '.json' replace would be a no-op).
  const files = fs
    .readdirSync('transcripts/markdown')
    .filter((file) => file.endsWith('.md'));
  // Ensure the output directory exists before the first write.
  fs.mkdirSync('embeddings', { recursive: true });
  // Iterate through each file and calculate the embeddings.
  // entries() gives the progress index directly — avoids the O(n^2)
  // files.indexOf(file) lookup inside the loop.
  for (const [index, file] of files.entries()) {
    const text = fs.readFileSync(`transcripts/markdown/${file}`, 'utf-8');
    // Split the transcript into header-delimited chunks.
    const chunks = calculateMarkdownChunks(text);
    // Extract embeddings for each chunk, sequentially — the model processes
    // one input at a time.
    const output = [];
    for (const chunk of chunks) {
      const embeddingOutput = await extractor(chunk, {
        pooling: 'mean',
        normalize: true,
      });
      // tolist() yields a batch of size 1; take the single embedding vector.
      const embedding = embeddingOutput.tolist()[0];
      output.push({ text: chunk, embedding });
      fullOutput.push({ text: chunk, embedding });
    }
    // Save this file's embeddings alongside the others.
    const fileOut = `embeddings/${file.replace('.md', '.json')}`;
    fs.writeFileSync(fileOut, JSON.stringify(output));
    console.log(
      `Embeddings saved for ${file} to ${fileOut} (${output.length} chunks) (${
        index + 1
      }/${files.length})`
    );
  }
  // Save the combined output from all files to a single file.
  const fileOut = `embeddings.json`;
  fs.writeFileSync(fileOut, JSON.stringify(fullOutput));
  console.log(`Complete embeddings saved to ${fileOut}`);
})().catch((err) => {
  // The IIFE's promise was previously floating: any failure became an
  // unhandled rejection. Report it and signal failure to the shell.
  console.error('Embedding run failed:', err);
  process.exitCode = 1;
});
/**
 * Split markdown text into chunks, starting a new chunk at every header line
 * (a trimmed line beginning with '#'). Lines are trimmed before being joined
 * back with '\n', and each finished chunk is trimmed of surrounding whitespace.
 *
 * Fix: the previous version tested `if (chunk)` on the raw accumulator, which
 * is truthy even when it holds only newlines — so leading blank lines (or an
 * all-blank input) produced an empty-string chunk that was then sent to the
 * embedding model. Guard on the trimmed content instead.
 *
 * @param {string} text - Full markdown document.
 * @returns {string[]} Non-empty, trimmed chunks in document order.
 */
function calculateMarkdownChunks(text) {
  const chunks = [];
  let chunk = '';
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    // A header starts a new chunk: flush the accumulated one first,
    // but only if it contains actual content (not just blank lines).
    if (line.startsWith('#') && chunk.trim()) {
      chunks.push(chunk.trim());
      chunk = '';
    }
    chunk += line + '\n';
  }
  // Flush the final chunk, again skipping whitespace-only content.
  if (chunk.trim()) {
    chunks.push(chunk.trim());
  }
  return chunks;
}