Skip to content

Commit cc58546

Browse files
committed
Add separate extract internals
1 parent c7c2668 commit cc58546

4 files changed

Lines changed: 204 additions & 22 deletions

File tree

examples/page-actions/extract.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/**
2+
* # Extract Example
3+
*
4+
* This example demonstrates how to use HyperAgent with a defined output schema
5+
* to ensure structured and validated responses from the agent.
6+
*
7+
* ## What This Example Does
8+
*
9+
* The agent performs a task with structured output that:
10+
* 1. Defines a Zod schema for the expected output format
11+
* 2. Performs actions to complete the specified task
12+
* 3. Returns movie information in the structured format specified by the schema
13+
*
14+
* ## Prerequisites
15+
*
16+
* 1. Node.js environment
17+
* 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
18+
*
19+
* ## Running the Example
20+
*
21+
* ```bash
22+
* yarn ts-node -r tsconfig-paths/register examples/page-actions/extract.ts
23+
* ```
24+
*/
25+
26+
import "dotenv/config";
27+
import { HyperAgent } from "@hyperbrowser/agent";
28+
29+
import chalk from "chalk";
30+
import { ChatOpenAI } from "@langchain/openai";
31+
import { z } from "zod";
32+
33+
/**
 * Runs the extraction example end to end.
 *
 * Creates a HyperAgent backed by the `gpt-4o` chat model, opens the IMDb page
 * for title `tt0133093`, and calls `page.extract` with a Zod schema describing
 * the director(s), release year and rating. The extracted value is printed and
 * returned.
 *
 * The agent is closed in a `finally` block so it is shut down even when page
 * creation, navigation or extraction throws.
 *
 * @returns The value produced by `page.extract` for the schema below.
 */
async function runEval() {
  const llm = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    model: "gpt-4o",
  });

  const agent = new HyperAgent({
    llm,
    debug: true,
  });

  try {
    const page = await agent.newPage();
    await page.goto("https://www.imdb.com/title/tt0133093/");

    const result = await page.extract(
      "extract the director, release year, and rating",
      z.object({
        director: z.array(z.string().describe("The name of the movie director")),
        releaseYear: z.number().describe("The year the movie was released"),
        rating: z.string().describe("The IMDb rating of the movie"),
      })
    );

    console.log(chalk.green.bold("\nResult:"));
    console.log(chalk.white(JSON.stringify(result, null, 2)));
    return result;
  } finally {
    // Always release the agent, including on failure paths.
    await agent.closeAgent();
  }
}
61+
62+
// Entry point: run the example; on any failure print the error and exit with status 1.
runEval().catch((err) => {
  console.error(chalk.red("Error:"), err);
  process.exit(1);
});

src/agent/actions/extract.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const ExtractAction = z
1313

1414
export type ExtractActionType = z.infer<typeof ExtractAction>;
1515

16-
export const ExtractActionDefinition: AgentActionDefinition = {
16+
export const ExtractActionDefinition: AgentActionDefinition<typeof ExtractAction> = {
1717
type: "extract" as const,
1818
actionParams: ExtractAction,
1919
run: async (

src/agent/index.ts

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import { runAgentTask } from "./tools/agent";
3434
import { HyperPage, HyperVariable } from "@/types/agent/types";
3535
import { z } from "zod";
3636
import { ErrorEmitter } from "@/utils";
37+
import { PageExtractFn } from "./tools/page-actions/extract";
3738

3839
export class HyperAgent<T extends BrowserProviders = "Local"> {
3940
private llm: BaseChatModel;
@@ -570,27 +571,13 @@ export class HyperAgent<T extends BrowserProviders = "Local"> {
570571
400
571572
);
572573
}
573-
if (task) {
574-
const res = await this.executeTask(
575-
`You have to perform an extraction on the current page. You have to perform the extraction according to the task: ${task}. Make sure your final response only contains the extracted content`,
576-
{
577-
maxSteps: 2,
578-
outputSchema,
579-
},
580-
page
581-
);
582-
if (outputSchema) {
583-
return JSON.parse(res.output as string);
584-
}
585-
return res.output as string;
586-
} else {
587-
const res = await this.executeTask(
588-
"You have to perform a data extraction on the current page. Make sure your final response only contains the extracted content",
589-
{ maxSteps: 2, outputSchema },
590-
page
591-
);
592-
return JSON.parse(res.output as string);
593-
}
574+
return await PageExtractFn({
575+
task,
576+
schema: outputSchema,
577+
page,
578+
llm: this.llm,
579+
tokenLimit: this.tokenLimit,
580+
});
594581
};
595582
return hyperPage;
596583
}
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import { z } from "zod";
2+
import { parseMarkdown } from "@/utils/html-to-markdown";
3+
import { Page } from "playwright";
4+
import { BaseChatModel } from "@langchain/core/language_models/chat_models";
5+
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
6+
7+
/**
 * Arguments accepted by `PageExtractFn`.
 *
 * @typeParam T - Zod object schema type for the structured result, or
 *   `undefined` when no schema is supplied.
 */
export interface ExtractOptions<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
> {
  /** Playwright page whose content, metadata and screenshot are read. */
  page: Page;

  /** Chat model that performs the extraction. */
  llm: BaseChatModel;

  /** Optional natural-language description of what to extract. */
  task?: string;

  /** Optional Zod object schema the result should conform to. */
  schema?: T;

  /** Optional budget used to trim the page markdown; `PageExtractFn` defaults it to 4000. */
  tokenLimit?: number;
}
16+
17+
/**
 * Extracts information from the current page with a single LLM call.
 *
 * The page HTML is converted to markdown and trimmed to a character budget
 * derived from `tokenLimit`. That text, a set of `<meta>`/`<link>` values read
 * from the document, and a CDP screenshot are sent to `llm` in one message.
 *
 * @param options - See {@link ExtractOptions}. At least one of `schema` or
 *   `task` must be provided; `tokenLimit` defaults to 4000.
 * @returns When `schema` is given, the result of
 *   `llm.withStructuredOutput(schema)`; otherwise the model's reply as text.
 * @throws Error when neither `schema` nor `task` is provided.
 */
export async function PageExtractFn<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
>({
  schema,
  task,
  page,
  llm,
  tokenLimit = 4000,
}: ExtractOptions<T>): Promise<T extends z.AnyZodObject ? z.infer<T> : string> {
  if (!schema && !task) {
    throw new Error("Either schema or task must be provided");
  }

  // Get page content and convert to markdown
  const content = await page.content();
  const markdown = await parseMarkdown(content);

  // Get page metadata
  const metadata = await page.evaluate(() => {
    const meta = {
      title: document.title,
      description:
        document
          .querySelector('meta[name="description"]')
          ?.getAttribute("content") || "",
      keywords:
        document
          .querySelector('meta[name="keywords"]')
          ?.getAttribute("content") || "",
      ogTitle:
        document
          .querySelector('meta[property="og:title"]')
          ?.getAttribute("content") || "",
      ogDescription:
        document
          .querySelector('meta[property="og:description"]')
          ?.getAttribute("content") || "",
      ogImage:
        document
          .querySelector('meta[property="og:image"]')
          ?.getAttribute("content") || "",
      canonicalUrl:
        document.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
        "",
    };
    return meta;
  });

  // TODO: Maybe take fullscreen screenshots here, and then break them up into manageable chunks usable by the LLM.
  // Take screenshot for context
  const screenshotBase64 = await capturePageScreenshot(page);

  // Trim markdown to stay within token limit
  const trimmedMarkdown = trimMarkdownToTokenLimit(markdown, tokenLimit);

  // Create messages
  const messages = [
    new SystemMessage(
      `You are an expert at extracting structured information from web pages. Your task is to:
1. Analyze the provided markdown content, metadata, and screenshot of a webpage
2. Extract relevant information based on the provided task and schema (if any)
3. Pay attention to both the text content and visual layout
4. Handle cases where information might be split across different sections
5. Ensure the response is complete and accurate
6. Format the response appropriately based on the schema (if provided)

Remember to:
- Look for information in both the main content and page metadata (title, description, etc.)
- Consider the visual hierarchy and layout of the page
- Handle cases where information might be ambiguous or incomplete
- Ensure the response is complete and accurate`
    ),
    new HumanMessage({
      content: [
        {
          type: "text",
          text: `Extract information from the page${task ? ` according to this task: ${task}` : ""}${schema ? " and format according to the schema" : ""}`,
        },
        { type: "text", text: "Here is the page metadata:" },
        { type: "text", text: JSON.stringify(metadata, null, 2) },
        { type: "text", text: "Here is the page content:" },
        { type: "text", text: trimmedMarkdown },
        { type: "text", text: "Here is a screenshot of the page:" },
        {
          type: "image_url",
          image_url: {
            url: `data:image/png;base64,${screenshotBase64}`,
          },
        },
      ],
    }),
  ];

  if (schema) {
    // Create structured output chain
    const chain = llm.withStructuredOutput(schema);
    const result = await chain.invoke(messages);
    return result as T extends z.AnyZodObject ? z.infer<T> : string;
  }

  // For task-based extraction, return the reply as plain text. Message content
  // may be a string or a list of content parts, so it is flattened explicitly
  // instead of being cast to `string`.
  const response = await llm.invoke(messages);
  return messageContentToText(response.content) as T extends z.AnyZodObject
    ? z.infer<T>
    : string;
}

/**
 * Captures a screenshot of `page` over a dedicated CDP session and returns the
 * `data` field of the `Page.captureScreenshot` response. The session is
 * detached in `finally`, so it is released even if the capture fails.
 */
async function capturePageScreenshot(page: Page): Promise<string> {
  const cdpSession = await page.context().newCDPSession(page);
  try {
    const { data } = await cdpSession.send("Page.captureScreenshot");
    return data;
  } finally {
    await cdpSession.detach();
  }
}

/**
 * Cuts `markdown` to the character budget implied by `tokenLimit`, appending a
 * truncation notice when anything was removed.
 */
function trimMarkdownToTokenLimit(markdown: string, tokenLimit: number): string {
  // TODO: Maybe use js-tiktoken here ?
  // Rough heuristic for converting a token budget into a character budget.
  const avgTokensPerChar = 0.75;
  const maxChars = Math.floor(tokenLimit / avgTokensPerChar);
  return markdown.length > maxChars
    ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
    : markdown;
}

/**
 * Flattens chat-message content into plain text: strings pass through, arrays
 * have their string items and `text` parts concatenated (other parts are
 * ignored), and anything else becomes an empty string.
 */
function messageContentToText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  return content
    .map((part: unknown) => {
      if (typeof part === "string") {
        return part;
      }
      if (typeof part === "object" && part !== null && "text" in part) {
        const text = (part as { text?: unknown }).text;
        return typeof text === "string" ? text : "";
      }
      return "";
    })
    .join("");
}

0 commit comments

Comments
 (0)