-
Notifications
You must be signed in to change notification settings - Fork 160
Expand file tree
/
Copy pathextract.ts
More file actions
93 lines (86 loc) · 3.02 KB
/
extract.ts
File metadata and controls
93 lines (86 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import { z } from "zod";
import { ActionContext, ActionOutput, AgentActionDefinition } from "@/types";
import { parseMarkdown } from "@/utils/html-to-markdown";
import fs from "fs";
/** Schema for the single free-text field that states what should be extracted. */
const objectiveField = z.string().describe("The goal of the extraction.");

/**
 * Parameter schema for the "extract" action.
 *
 * Carries one string field, `objective`; the schema-level description lists
 * example extraction targets.
 */
export const ExtractAction = z
  .object({ objective: objectiveField })
  .describe(
    "Extract content from the page according to the objective, e.g. product prices, contact information, article text, table data, or specific metadata fields"
  );

/** Parameter object type inferred from the {@link ExtractAction} schema. */
export type ExtractActionType = z.infer<typeof ExtractAction>;
/**
 * Definition of the "extract" agent action.
 *
 * `run` converts the current page's HTML to markdown, captures a screenshot
 * through a CDP session, trims the markdown to a character budget derived from
 * `ctx.tokenLimit`, and asks `ctx.llm` to extract the information described by
 * `action.objective` from the markdown plus the screenshot.
 *
 * When `ctx.debugDir` is set, the screenshot and the trimmed markdown are also
 * written into that directory.
 *
 * `run` never throws: any failure is reported as `{ success: false, message }`.
 */
export const ExtractActionDefinition: AgentActionDefinition<typeof ExtractAction> = {
  type: "extract" as const,
  actionParams: ExtractAction,

  /**
   * Executes the extraction.
   *
   * @param ctx - Action context providing the page, the LLM, the token limit
   *   and the optional debug directory.
   * @param action - Validated parameters; only `objective` is read.
   * @returns `success: true` with the model's reply embedded in `message`, or
   *   `success: false` when the reply is empty or any step throws.
   */
  run: async (
    ctx: ActionContext,
    action: ExtractActionType
  ): Promise<ActionOutput> => {
    try {
      const content = await ctx.page.content();
      const markdown = await parseMarkdown(content);
      const objective = action.objective;

      // Take a screenshot of the page over a dedicated CDP session. The
      // session is released in `finally` so it does not leak when the capture
      // itself fails.
      const cdpSession = await ctx.page.context().newCDPSession(ctx.page);
      let screenshot: { data: string };
      try {
        screenshot = await cdpSession.send("Page.captureScreenshot");
      } finally {
        try {
          // Awaited so a rejection cannot surface as an unhandled rejection.
          await cdpSession.detach();
        } catch {
          // Best-effort cleanup: a failed detach must not discard a screenshot
          // that was captured successfully, nor mask the capture error.
        }
      }

      // Save screenshot to debug dir if exists
      if (ctx.debugDir) {
        fs.writeFileSync(
          `${ctx.debugDir}/extract-screenshot.png`,
          Buffer.from(screenshot.data, "base64")
        );
      }

      // Trim markdown to stay within token limit
      // TODO: this is a hack, we should use a better token counting method
      const avgTokensPerChar = 0.75; // Conservative estimate of tokens per character
      const maxChars = Math.floor(ctx.tokenLimit / avgTokensPerChar);
      const trimmedMarkdown =
        markdown.length > maxChars
          ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
          : markdown;

      if (ctx.debugDir) {
        fs.writeFileSync(
          `${ctx.debugDir}/extract-markdown-content.md`,
          trimmedMarkdown
        );
      }

      // Single user message carrying the text prompt followed by the
      // screenshot as a base64 data URL.
      const response = await ctx.llm.invoke([
        {
          role: "user",
          content: [
            {
              type: "text",
              text: `Extract the following information from the page according to this objective: "${objective}"\n\nPage content:\n${trimmedMarkdown}\nHere is as screenshot of the page:\n`,
            },
            {
              type: "image_url",
              image_url: {
                url: `data:image/png;base64,${screenshot.data}`,
              },
            },
          ],
        },
      ]);

      // NOTE(review): this assumes `response.content` is a string; if the LLM
      // client can return structured content parts, the interpolation below
      // would not render them as text — confirm against the llm type.
      if (response.content.length === 0) {
        return {
          success: false,
          message: `No content extracted from page.`,
        };
      }

      return {
        success: true,
        message: `Extracted content from page:\n${response.content}`,
      };
    } catch (error) {
      // Report every failure through the action result instead of throwing.
      return {
        success: false,
        message: `Failed to extract content: ${error}`,
      };
    }
  },

  /**
   * Renders a one-line, human-readable description of the action.
   *
   * @param params - The action parameters.
   * @returns A string quoting the extraction objective.
   */
  pprintAction: function(params: ExtractActionType): string {
    return `Extract content from page with objective: "${params.objective}"`;
  },
};