|
| 1 | +import { z } from "zod"; |
| 2 | +import { parseMarkdown } from "@/utils/html-to-markdown"; |
| 3 | +import { Page } from "playwright"; |
| 4 | +import { BaseChatModel } from "@langchain/core/language_models/chat_models"; |
| 5 | +import { HumanMessage, SystemMessage } from "@langchain/core/messages"; |
| 6 | + |
/**
 * Options accepted by `PageExtractFn`.
 *
 * At least one of `schema` or `task` must be supplied; `PageExtractFn` throws
 * when both are missing.
 *
 * @typeParam T - Zod object schema describing the structured result, or
 *   `undefined` when only free-form (task-based) extraction is wanted.
 */
export interface ExtractOptions<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
> {
  /** Playwright page whose HTML, metadata and screenshot are sent to the model. */
  page: Page;

  /** Chat model used to perform the extraction. */
  llm: BaseChatModel;

  /** Zod schema the model output should conform to (structured extraction). */
  schema?: T;

  /** Natural-language description of what to extract from the page. */
  task?: string;

  /**
   * Budget used to truncate the page markdown before it is sent to the model.
   * `PageExtractFn` falls back to 4000 when this is omitted.
   */
  tokenLimit?: number;
}
| 16 | + |
/**
 * Extracts information from a web page with an LLM.
 *
 * The page HTML is converted to markdown, a handful of `<meta>`/`<link>` values
 * are collected, a screenshot is captured over CDP, and all three are sent to
 * the model in a single multimodal message.
 *
 * @typeParam T - Zod object schema for structured output, or `undefined`.
 * @param options - See {@link ExtractOptions}. `tokenLimit` defaults to 4000.
 * @returns The schema-shaped object produced by `llm.withStructuredOutput`
 *   when `schema` is given; otherwise the text of the model's reply.
 * @throws Error if neither `schema` nor `task` is provided.
 *
 * NOTE(review): `T` defaults to `z.AnyZodObject`, so a call that passes only
 * `task` is typed as returning an object although a string is returned at
 * runtime. Callers can pass `PageExtractFn<undefined>(...)` to get the
 * `string` return type; changing the default would alter the public types.
 */
export async function PageExtractFn<
  T extends z.AnyZodObject | undefined = z.AnyZodObject,
>({
  schema,
  task,
  page,
  llm,
  tokenLimit = 4000,
}: ExtractOptions<T>): Promise<T extends z.AnyZodObject ? z.infer<T> : string> {
  type ExtractResult = T extends z.AnyZodObject ? z.infer<T> : string;

  if (!schema && !task) {
    throw new Error("Either schema or task must be provided");
  }

  // Get page content and convert to markdown
  const html = await page.content();
  const markdown = await parseMarkdown(html);

  // Get page metadata. The callback is evaluated inside the page, so the
  // helper has to live inside it rather than in the enclosing Node scope.
  const metadata = await page.evaluate(() => {
    const attr = (selector: string, name: string): string =>
      document.querySelector(selector)?.getAttribute(name) ?? "";

    return {
      title: document.title,
      description: attr('meta[name="description"]', "content"),
      keywords: attr('meta[name="keywords"]', "content"),
      ogTitle: attr('meta[property="og:title"]', "content"),
      ogDescription: attr('meta[property="og:description"]', "content"),
      ogImage: attr('meta[property="og:image"]', "content"),
      canonicalUrl: attr('link[rel="canonical"]', "href"),
    };
  });

  // TODO: Maybe take fullscreen screenshots here, and then break them up into manageable chunks usable by the LLM.
  // Take screenshot for context.
  // NOTE(review): CDP sessions are a Chromium-only Playwright feature — confirm
  // callers never pass a Firefox/WebKit page.
  const cdpSession = await page.context().newCDPSession(page);
  let screenshotData: string;
  try {
    const screenshot = await cdpSession.send("Page.captureScreenshot");
    screenshotData = screenshot.data;
  } finally {
    // Always release the session, even if the capture failed, and await the
    // detach so a rejection is not left as a floating promise.
    await cdpSession.detach();
  }

  // TODO: Maybe use js-tiktoken here ?
  // Trim markdown to stay within token limit.
  // NOTE(review): 0.75 looks like the "words per token" rule of thumb; English
  // text is usually closer to 0.25 tokens per character. The value is kept
  // as-is because it only makes the truncation more conservative.
  const avgTokensPerChar = 0.75;
  // Clamp at 0: a negative bound would make `slice` count from the end of the
  // string and keep almost the whole document instead of truncating it.
  const maxChars = Math.max(0, Math.floor(tokenLimit / avgTokensPerChar));
  const trimmedMarkdown =
    markdown.length > maxChars
      ? markdown.slice(0, maxChars) + "\n[Content truncated due to length]"
      : markdown;

  // Create messages
  const messages = [
    new SystemMessage(
      `You are an expert at extracting structured information from web pages. Your task is to:
1. Analyze the provided markdown content, metadata, and screenshot of a webpage
2. Extract relevant information based on the provided task and schema (if any)
3. Pay attention to both the text content and visual layout
4. Handle cases where information might be split across different sections
5. Ensure the response is complete and accurate
6. Format the response appropriately based on the schema (if provided)

Remember to:
- Look for information in both the main content and page metadata (title, description, etc.)
- Consider the visual hierarchy and layout of the page
- Handle cases where information might be ambiguous or incomplete
- Ensure the response is complete and accurate`
    ),
    new HumanMessage({
      content: [
        {
          type: "text",
          text: `Extract information from the page${task ? ` according to this task: ${task}` : ""}${schema ? " and format according to the schema" : ""}`,
        },
        { type: "text", text: "Here is the page metadata:" },
        { type: "text", text: JSON.stringify(metadata, null, 2) },
        { type: "text", text: "Here is the page content:" },
        { type: "text", text: trimmedMarkdown },
        { type: "text", text: "Here is a screenshot of the page:" },
        {
          type: "image_url",
          image_url: {
            url: `data:image/png;base64,${screenshotData}`,
          },
        },
      ],
    }),
  ];

  if (schema) {
    // Structured extraction: let the model bind its output to the schema.
    const chain = llm.withStructuredOutput(schema);
    const result = await chain.invoke(messages);
    return result as ExtractResult;
  }

  // Task-based extraction: return the reply as plain text. Message content may
  // be a string or an array of content parts, so text parts are concatenated
  // to honour the declared `string` return type.
  const response = await llm.invoke(messages);
  const text =
    typeof response.content === "string"
      ? response.content
      : response.content
          .map((part) =>
            part.type === "text" && typeof part.text === "string"
              ? part.text
              : "",
          )
          .join("");
  return text as ExtractResult;
}
0 commit comments