diff --git a/apps/website/app/api/ai/extract/route.ts b/apps/website/app/api/ai/extract/route.ts
new file mode 100644
index 000000000..dad956de4
--- /dev/null
+++ b/apps/website/app/api/ai/extract/route.ts
@@ -0,0 +1,227 @@
+// POST /api/ai/extract — sends an uploaded PDF to the selected LLM provider
+// and returns structured discourse-graph node candidates.
+import { NextRequest, NextResponse } from "next/server";
+import {
+  ExtractionRequestSchema,
+  type ExtractionResponse,
+  type ProviderId,
+} from "~/types/extraction";
+import {
+  anthropicConfig,
+  openaiConfig,
+  geminiConfig,
+} from "~/utils/llm/providers";
+import type { LLMProviderConfig } from "~/types/llm";
+import { buildUserPrompt } from "~/prompts/extraction";
+import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
+
+/* eslint-disable @typescript-eslint/naming-convention */
+
+export const runtime = "nodejs";
+export const maxDuration = 300;
+
+// Provider-agnostic inputs needed to build one extraction request.
+type ExtractionParams = {
+  model: string;
+  systemPrompt: string;
+  pdfBase64: string;
+  userPrompt: string;
+};
+
+// Per-provider adapter: endpoint, request-body shape, and how to pull the
+// response text out of that provider's payload.
+type ExtractionProviderConfig = {
+  base: LLMProviderConfig;
+  apiUrl: (model: string) => string;
+  buildRequestBody: (params: ExtractionParams) => unknown;
+  extractResponseText: (data: unknown) => string | undefined;
+};
+
+const PROVIDERS: Record<ProviderId, ExtractionProviderConfig> = {
+  anthropic: {
+    base: anthropicConfig,
+    apiUrl: () => "https://api.anthropic.com/v1/messages",
+    buildRequestBody: ({ model, systemPrompt, pdfBase64, userPrompt }) => ({
+      model,
+      max_tokens: 16384,
+      temperature: 0.2,
+      system: systemPrompt,
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              // PDF is attached as a base64 document content block.
+              type: "document",
+              source: {
+                type: "base64",
+                media_type: "application/pdf",
+                data: pdfBase64,
+              },
+            },
+            { type: "text", text: userPrompt },
+          ],
+        },
+      ],
+    }),
+    extractResponseText: (data) =>
+      // Normalize a possible null from the shared config to undefined.
+      anthropicConfig.extractResponseText(data) ?? undefined,
+  },
+  openai: {
+    base: openaiConfig,
+    apiUrl: () => "https://api.openai.com/v1/responses",
+    buildRequestBody: ({ model, systemPrompt, pdfBase64, userPrompt }) => ({
+      model,
+      instructions: systemPrompt,
+      input: [
+        {
+          role: "user",
+          content: [
+            {
+              // Responses API takes the PDF inline as a data: URL.
+              type: "input_file",
+              filename: "paper.pdf",
+              file_data: `data:application/pdf;base64,${pdfBase64}`,
+            },
+            { type: "input_text", text: userPrompt },
+          ],
+        },
+      ],
+      temperature: 0.2,
+      max_output_tokens: 16384,
+    }),
+    extractResponseText: (data) => {
+      // Responses API output is a list of items; find the assistant message,
+      // then its output_text content part.
+      const resp = data as {
+        output?: {
+          type: string;
+          content?: { type: string; text: string }[];
+        }[];
+      };
+      const message = resp.output?.find((o) => o.type === "message");
+      return message?.content?.find((c) => c.type === "output_text")?.text;
+    },
+  },
+  gemini: {
+    base: geminiConfig,
+    apiUrl: (model) => {
+      // NOTE(review): the key is embedded in the query string; prefer the
+      // x-goog-api-key header — confirm geminiConfig.apiHeaders does not
+      // already supply it, to avoid sending the key twice.
+      const key = process.env[geminiConfig.apiKeyEnvVar];
+      return `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${key}`;
+    },
+    buildRequestBody: ({ systemPrompt, pdfBase64, userPrompt }) => ({
+      system_instruction: { parts: [{ text: systemPrompt }] },
+      contents: [
+        {
+          role: "user",
+          parts: [
+            {
+              inline_data: { mime_type: "application/pdf", data: pdfBase64 },
+            },
+            { text: userPrompt },
+          ],
+        },
+      ],
+      generationConfig: {
+        temperature: 0.2,
+        maxOutputTokens: 16384,
+        // Ask Gemini to emit raw JSON so downstream parsing is simpler.
+        responseMimeType: "application/json",
+      },
+    }),
+    extractResponseText: (data) =>
+      geminiConfig.extractResponseText(data) ?? undefined,
+  },
+};
+
+export const POST = async (
+  request: NextRequest,
+): Promise<NextResponse<ExtractionResponse>> => {
+  // 400 for a body that is not even JSON.
+  let body: unknown;
+  try {
+    body = await request.json();
+  } catch {
+    return NextResponse.json(
+      { success: false, error: "Invalid JSON body" },
+      { status: 400 },
+    );
+  }
+
+  // 400 for JSON that does not match the request schema.
+  const parsed = ExtractionRequestSchema.safeParse(body);
+  if (!parsed.success) {
+    return NextResponse.json(
+      { success: false, error: parsed.error.message },
+      { status: 400 },
+    );
+  }
+
+  const {
+    pdfBase64,
+    researchQuestion,
+    nodeTypes,
+    model,
+    provider,
+    systemPrompt,
+  } = parsed.data;
+
+  const config = PROVIDERS[provider];
+  const apiKey = process.env[config.base.apiKeyEnvVar];
+
+  // Server misconfiguration (missing key) is a 500, not the caller's fault.
+  if (!apiKey) {
+    return NextResponse.json(
+      {
+        success: false,
+        error: `API key not configured for ${provider}.`,
+      },
+      { status: 500 },
+    );
+  }
+
+  const userPrompt = buildUserPrompt(nodeTypes, researchQuestion);
+
+  try {
+    const response = await fetch(config.apiUrl(model), {
+      method: "POST",
+      headers: config.base.apiHeaders(apiKey),
+      body: JSON.stringify(
+        config.buildRequestBody({
+          model,
+          systemPrompt,
+          pdfBase64,
+          userPrompt,
+        }),
+      ),
+      // Abort just under the route's maxDuration (300 s) so we can still
+      // return a well-formed error response.
+      signal: AbortSignal.timeout(270_000),
+    });
+
+    if (!response.ok) {
+      // Try to surface the provider's own error message; fall back to status.
+      const errorData: unknown = await response.json().catch(() => null);
+      const errorObj = errorData as { error?: { message?: string } } | null;
+      const message =
+        errorObj?.error?.message ??
+        `${provider} API error: ${response.status}`;
+      return NextResponse.json(
+        { success: false, error: message },
+        { status: 502 },
+      );
+    }
+
+    const responseData: unknown = await response.json();
+    const rawText = config.extractResponseText(responseData);
+
+    if (!rawText) {
+      return NextResponse.json(
+        { success: false, error: `Empty response from ${provider}` },
+        { status: 502 },
+      );
+    }
+
+    const result = parseExtractionResponse(rawText);
+    return NextResponse.json({ success: true, data: result });
+  } catch (error) {
+    // JSON.parse throws SyntaxError and schema validation throws ZodError —
+    // both mean the LLM returned malformed output, so report upstream (502).
+    const isUpstreamError =
+      error instanceof SyntaxError ||
+      (error instanceof Error && error.name === "ZodError");
+
+    const message = isUpstreamError
+      ? "Failed to parse extraction response — LLM returned invalid output"
+      : error instanceof Error
+        ? `Extraction failed — ${error.message}`
+        : "Extraction failed";
+
+    console.error("AI extraction failed:", error);
+    return NextResponse.json(
+      { success: false, error: message },
+      { status: isUpstreamError ? 502 : 500 },
+    );
+  }
+};
diff --git a/apps/website/app/prompts/extraction.ts b/apps/website/app/prompts/extraction.ts
new file mode 100644
index 000000000..a8301f051
--- /dev/null
+++ b/apps/website/app/prompts/extraction.ts
@@ -0,0 +1,73 @@
+import { NODE_TYPE_LABELS, type NodeType } from "~/types/extraction";
+
+// Default system prompt for discourse-graph extraction. Callers may override
+// it via the request's systemPrompt field.
+export const DEFAULT_EXTRACTION_PROMPT = `You are an expert research analyst specializing in extracting structured discourse graph nodes from academic papers.
+
+A discourse graph is a structured representation of the key intellectual contributions, claims, evidence, and questions in a body of research literature. Each node captures one atomic idea with a type tag.
+
+## Node types
+
+- **CLM (Claim)**: A specific, falsifiable assertion or argument made in the paper. Claims should be concise, standalone statements that capture a key point.
+- **QUE (Question)**: A research question posed or implied by the paper. These can be explicitly stated or inferred from gaps in the literature.
+- **EVD (Evidence)**: A specific piece of evidence (experimental result, statistical finding, observation) that supports or refutes a claim.
+- **SRC (Source)**: A bibliographic source referenced in the paper that is relevant to the discourse.
+- **ISS (Issue)**: A problem, challenge, or open issue identified in the paper. Represents unresolved tensions or difficulties.
+- **RES (Result)**: A specific finding or outcome reported in the paper, typically from experiments or analyses.
+- **HYP (Hypothesis)**: A testable prediction or proposed explanation that the paper investigates.
+- **CON (Conclusion)**: A final synthesized takeaway or implication drawn by the authors.
+- **EXP (Experiment)**: A described experimental procedure, study, or empirical investigation.
+- **THR (Theory)**: A theoretical framework, model, or conceptual lens used or proposed in the paper.
+- **ART (Artifact)**: A concrete artifact produced or used — a tool, dataset, software, protocol, or instrument.
+- **MTD (Method)**: A methodology, technique, or analytical approach described or applied.
+- **PAT (Pattern)**: A recurring pattern, trend, or regularity identified across data or literature.
+- **PRJ (Project)**: A named research project, initiative, or collaborative effort referenced in the paper.
+- **PRB (Problem)**: A well-defined problem that the paper addresses or formulates, distinct from a general issue.
+
+## Extraction guidelines
+
+- Extract meaningful, substantive nodes — avoid trivial or overly generic statements.
+- Claims should be specific enough to be debatable.
+- Evidence should include quantitative details when available.
+- Questions should be open-ended and research-worthy.
+- Sources should include author names and year when available.
+- Results should capture specific findings, not vague summaries.
+- Conclusions should be high-level takeaways distinct from individual claims.
+- Problems should be well-scoped, not restated issues.
+- For each node, include a short supporting snippet (exact quote or figure/table reference) from the paper.
+- Include the section name and page number when determinable.
+- Aim for 10–25 nodes depending on paper length and density.
+- Prefer quality over quantity.
+
+## Output format
+
+Respond with ONLY valid JSON (no markdown fences, no commentary) matching this structure:
+
+{
+  "paperTitle": "Title of the paper",
+  "paperAuthors": ["Author 1", "Author 2"],
+  "candidates": [
+    {
+      "nodeType": "CLM",
+      "content": "The extracted node text as a clear, concise statement",
+      "supportSnippet": "Short exact quote or figure/table reference from the paper",
+      "sourceSection": "Results",
+      "pageNumber": 3
+    }
+  ]
+}`;
+
+// Builds the per-request user prompt: which node types to extract, plus an
+// optional research question to focus the extraction.
+export const buildUserPrompt = (
+  nodeTypes: NodeType[],
+  researchQuestion?: string,
+): string => {
+  const typeList = nodeTypes
+    .map((t) => `${t} (${NODE_TYPE_LABELS[t]})`)
+    .join(", ");
+
+  let prompt = `Extract the following node types from the attached paper: ${typeList}`;
+
+  if (researchQuestion) {
+    prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
+  }
+
+  return prompt;
+};
diff --git a/apps/website/app/types/extraction.ts b/apps/website/app/types/extraction.ts
new file mode 100644
index 000000000..732115d24
--- /dev/null
+++ b/apps/website/app/types/extraction.ts
@@ -0,0 +1,78 @@
+import { z } from "zod";
+
+/* eslint-disable @typescript-eslint/naming-convention */
+
+// Canonical node-type tags; the zod enums below derive from this list.
+export const NODE_TYPES = [
+  "CLM",
+  "QUE",
+  "EVD",
+  "SRC",
+  "ISS",
+  "RES",
+  "HYP",
+  "CON",
+  "EXP",
+  "THR",
+  "ART",
+  "MTD",
+  "PAT",
+  "PRJ",
+  "PRB",
+] as const;
+
+export type NodeType = (typeof NODE_TYPES)[number];
+
+// Human-readable label for each tag (used in prompts and UI).
+export const NODE_TYPE_LABELS: Record<NodeType, string> = {
+  CLM: "Claim",
+  QUE: "Question",
+  EVD: "Evidence",
+  SRC: "Source",
+  ISS: "Issue",
+  RES: "Result",
+  HYP: "Hypothesis",
+  CON: "Conclusion",
+  EXP: "Experiment",
+  THR: "Theory",
+  ART: "Artifact",
+  MTD:
"Method", + PAT: "Pattern", + PRJ: "Project", + PRB: "Problem", +}; + +export const PROVIDER_IDS = ["anthropic", "openai", "gemini"] as const; + +export type ProviderId = (typeof PROVIDER_IDS)[number]; + +export const CandidateNodeSchema = z.object({ + nodeType: z.enum(NODE_TYPES), + content: z.string(), + supportSnippet: z.string(), + sourceSection: z.string().optional(), + pageNumber: z.number().optional(), +}); + +export type CandidateNode = z.infer; + +export const ExtractionResultSchema = z.object({ + paperTitle: z.string(), + paperAuthors: z.array(z.string()), + candidates: z.array(CandidateNodeSchema), +}); + +export type ExtractionResult = z.infer; + +export const ExtractionRequestSchema = z.object({ + pdfBase64: z.string().min(1).max(44_000_000), + researchQuestion: z.string().optional(), + nodeTypes: z.array(z.enum(NODE_TYPES)).min(1), + model: z.string().min(1), + provider: z.enum(PROVIDER_IDS), + systemPrompt: z.string().min(1), +}); + +export type ExtractionRequest = z.infer; + +export type ExtractionResponse = + | { success: true; data: ExtractionResult } + | { success: false; error: string }; diff --git a/apps/website/app/utils/ai/parseExtractionResponse.ts b/apps/website/app/utils/ai/parseExtractionResponse.ts new file mode 100644 index 000000000..def23d4ec --- /dev/null +++ b/apps/website/app/utils/ai/parseExtractionResponse.ts @@ -0,0 +1,23 @@ +import { + ExtractionResultSchema, + type ExtractionResult, +} from "~/types/extraction"; + +export const parseExtractionResponse = (raw: string): ExtractionResult => { + let cleaned = raw.trim(); + + if (cleaned.startsWith("```")) { + cleaned = cleaned + .replace(/^```(?:json)?\s*\n?/, "") + .replace(/\n?```\s*$/, ""); + } + + const firstBrace = cleaned.indexOf("{"); + const lastBrace = cleaned.lastIndexOf("}"); + if (firstBrace !== -1 && lastBrace > firstBrace) { + cleaned = cleaned.slice(firstBrace, lastBrace + 1); + } + + const parsed: unknown = JSON.parse(cleaned); + return 
ExtractionResultSchema.parse(parsed); +};