Add separate extract internals

AHarmlessPyro · AHarmlessPyro · commit cc58546b4aed · 2025-05-11T17:35:46.000-05:00
diff --git a/examples/page-actions/extract.ts b/examples/page-actions/extract.ts
@@ -0,0 +1,67 @@
+/**
+ * # Extract Example
+ *
+ * This example demonstrates how to use HyperAgent with a defined output schema
+ * to ensure structured and validated responses from the agent.
+ *
+ * ## What This Example Does
+ *
+ * The agent performs a task with structured output that:
+ * 1. Defines a Zod schema for the expected output format
+ * 2. Performs actions to complete the specified task
+ * 3. Returns the movie information in the structured format defined by that schema
+ *
+ * ## Prerequisites
+ *
+ * 1. Node.js environment
+ * 2. OpenAI API key set in your .env file (OPENAI_API_KEY)
+ *
+ * ## Running the Example
+ *
+ * ```bash
+ * yarn ts-node -r tsconfig-paths/register examples/page-actions/extract.ts
+ * ```
+ */
+
+import "dotenv/config";
+import { HyperAgent } from "@hyperbrowser/agent";
+
+import chalk from "chalk";
+import { ChatOpenAI } from "@langchain/openai";
+import { z } from "zod";
+
+/** Extracts schema-shaped movie facts from an IMDb page, prints and returns them. */
+async function runEval() {
+  const llm = new ChatOpenAI({
+    apiKey: process.env.OPENAI_API_KEY,
+    model: "gpt-4o",
+  });
+  const agent = new HyperAgent({ llm, debug: true });
+  const movieSchema = z.object({
+    director: z.array(z.string().describe("The name of the movie director")),
+    releaseYear: z.number().describe("The year the movie was released"),
+    rating: z.string().describe("The IMDb rating of the movie"),
+  });
+
+  try {
+    const page = await agent.newPage();
+    await page.goto("https://www.imdb.com/title/tt0133093/");
+    const result = await page.extract(
+      "extract the director, release year, and rating",
+      movieSchema
+    );
+    console.log(chalk.green.bold("\nResult:"));
+    console.log(chalk.white(JSON.stringify(result, null, 2)));
+    return result;
+  } finally {
+    // Release the agent even when navigation or extraction throws.
+    await agent.closeAgent();
+  }
+}
+
+// Entry point: run the example; on any failure, log the error and exit with
+// a non-zero status.
+runEval().catch((error) => {
+  console.error(chalk.red("Error:"), error);
+  process.exit(1);
+});
diff --git a/src/agent/actions/extract.ts b/src/agent/actions/extract.ts
@@ -13,7 +13,7 @@ export const ExtractAction = z
 
 export type ExtractActionType = z.infer<typeof ExtractAction>;
 
-export const ExtractActionDefinition: AgentActionDefinition = {
+export const ExtractActionDefinition: AgentActionDefinition<typeof ExtractAction> = {
   type: "extract" as const,
   actionParams: ExtractAction,
   run: async (
diff --git a/src/agent/index.ts b/src/agent/index.ts
@@ -34,6 +34,7 @@ import { runAgentTask } from "./tools/agent";
 import { HyperPage, HyperVariable } from "@/types/agent/types";
 import { z } from "zod";
 import { ErrorEmitter } from "@/utils";
+import { PageExtractFn } from "./tools/page-actions/extract";
 
 export class HyperAgent<T extends BrowserProviders = "Local"> {
   private llm: BaseChatModel;
@@ -570,27 +571,13 @@ export class HyperAgent<T extends BrowserProviders = "Local"> {
           400
         );
       }
-      if (task) {
-        const res = await this.executeTask(
-          `You have to perform an extraction on the current page. You have to perform the extraction according to the task: ${task}. Make sure your final response only contains the extracted content`,
-          {
-            maxSteps: 2,
-            outputSchema,
-          },
-          page
-        );
-        if (outputSchema) {
-          return JSON.parse(res.output as string);
-        }
-        return res.output as string;
-      } else {
-        const res = await this.executeTask(
-          "You have to perform a data extraction on the current page. Make sure your final response only contains the extracted content",
-          { maxSteps: 2, outputSchema },
-          page
-        );
-        return JSON.parse(res.output as string);
-      }
+      return await PageExtractFn({
+        task,
+        schema: outputSchema,
+        page,
+        llm: this.llm,
+        tokenLimit: this.tokenLimit,
+      });
     };
     return hyperPage;
   }
diff --git a/src/agent/tools/page-actions/extract.ts b/src/agent/tools/page-actions/extract.ts
@@ -0,0 +1,128 @@
+import { z } from "zod";
+import { parseMarkdown } from "@/utils/html-to-markdown";
+import { Page } from "playwright";
+import { BaseChatModel } from "@langchain/core/language_models/chat_models";
+import { HumanMessage, SystemMessage } from "@langchain/core/messages";
+
+export interface ExtractOptions<
+  T extends z.AnyZodObject | undefined = z.AnyZodObject,
+> {
+  page: Page; // Playwright page whose current content is read
+  llm: BaseChatModel; // chat model that performs the extraction
+  task?: string; // what to extract, in natural language
+  schema?: T; // Zod object schema the result should follow
+  tokenLimit?: number; // budget used when trimming the page markdown
+}
+
+/**
+ * Extracts information from the current page with a single LLM call whose prompt
+ * combines the trimmed page markdown, head metadata and a CDP screenshot.
+ *
+ * @returns The structured result when `schema` is given; otherwise the reply
+ *   content, which is asserted (not checked) to match the string return type.
+ * @throws Error when neither `schema` nor `task` is provided.
+ */
+export async function PageExtractFn<
+  T extends z.AnyZodObject | undefined = z.AnyZodObject,
+>({
+  schema,
+  task,
+  page,
+  llm,
+  tokenLimit = 4000,
+}: ExtractOptions<T>): Promise<T extends z.AnyZodObject ? z.infer<T> : string> {
+  if (!schema && !task) {
+    throw new Error("Either schema or task must be provided");
+  }
+
+  // Page text as markdown, plus head metadata (missing tags become "").
+  const markdown = await parseMarkdown(await page.content());
+  const metadata = await page.evaluate(() => ({
+    title: document.title,
+    description:
+      document
+        .querySelector('meta[name="description"]')
+        ?.getAttribute("content") || "",
+    keywords:
+      document
+        .querySelector('meta[name="keywords"]')
+        ?.getAttribute("content") || "",
+    ogTitle:
+      document
+        .querySelector('meta[property="og:title"]')
+        ?.getAttribute("content") || "",
+    ogDescription:
+      document
+        .querySelector('meta[property="og:description"]')
+        ?.getAttribute("content") || "",
+    ogImage:
+      document
+        .querySelector('meta[property="og:image"]')
+        ?.getAttribute("content") || "",
+    canonicalUrl:
+      document.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
+      "",
+  }));
+
+  // TODO: Maybe take fullscreen screenshots here, and then break them up into manageable chunks usable by the LLM.
+  // Take screenshot for context; detach in `finally` so a failed capture does not leak the CDP session.
+  const cdpSession = await page.context().newCDPSession(page);
+  let screenshot: { data: string };
+  try {
+    screenshot = await cdpSession.send("Page.captureScreenshot");
+  } finally {
+    await cdpSession.detach();
+  }
+
+  // TODO: Maybe use js-tiktoken here ?
+  // Trim markdown to stay within token limit
+  const avgTokensPerChar = 0.75;
+  const maxChars = Math.floor(tokenLimit / avgTokensPerChar);
+  const trimmedMarkdown =
+    markdown.length > maxChars
+      ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
+      : markdown;
+
+  const messages = [
+    new SystemMessage(
+      `You are an expert at extracting structured information from web pages. Your task is to:
+1. Analyze the provided markdown content, metadata, and screenshot of a webpage
+2. Extract relevant information based on the provided task and schema (if any)
+3. Pay attention to both the text content and visual layout
+4. Handle cases where information might be split across different sections
+5. Ensure the response is complete and accurate
+6. Format the response appropriately based on the schema (if provided)
+
+Remember to:
+- Look for information in both the main content and page metadata (title, description, etc.)
+- Consider the visual hierarchy and layout of the page
+- Handle cases where information might be ambiguous or incomplete
+- Ensure the response is complete and accurate`
+    ),
+    new HumanMessage({
+      content: [
+        {
+          type: "text",
+          text: `Extract information from the page${task ? ` according to this task: ${task}` : ""}${schema ? " and format according to the schema" : ""}`,
+        },
+        { type: "text", text: "Here is the page metadata:" },
+        { type: "text", text: JSON.stringify(metadata, null, 2) },
+        { type: "text", text: "Here is the page content:" },
+        { type: "text", text: trimmedMarkdown },
+        { type: "text", text: "Here is a screenshot of the page:" },
+        {
+          type: "image_url",
+          image_url: { url: `data:image/png;base64,${screenshot.data}` },
+        },
+      ],
+    }),
+  ];
+
+  if (schema) {
+    const result = await llm.withStructuredOutput(schema).invoke(messages);
+    return result as T extends z.AnyZodObject ? z.infer<T> : string;
+  }
+  // For task-based extraction, get raw response
+  const response = await llm.invoke(messages);
+  return response.content as T extends z.AnyZodObject ? z.infer<T> : string;
+}