DiscourseGraphs · sid597 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/apps/website/app/api/ai/extract/route.ts b/apps/website/app/api/ai/extract/route.ts
@@ -0,0 +1,227 @@
+import { NextRequest, NextResponse } from "next/server";
+import {
+  ExtractionRequestSchema,
+  type ExtractionResponse,
+  type ProviderId,
+} from "~/types/extraction";
+import {
+  anthropicConfig,
+  openaiConfig,
+  geminiConfig,
+} from "~/utils/llm/providers";
+import type { LLMProviderConfig } from "~/types/llm";
+import { buildUserPrompt } from "~/prompts/extraction";
+import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
+
+/* eslint-disable @typescript-eslint/naming-convention */
+
+// Next.js route segment config: execute this handler on the Node.js runtime.
+export const runtime = "nodejs";
+// Next.js route segment config: maximum execution time for this route, in
+// seconds. The upstream fetch in POST aborts after 270 s, which stays below
+// this limit.
+export const maxDuration = 300;
+
+/** Inputs handed to a provider's `buildRequestBody` to assemble its payload. */
+type ExtractionParams = {
+  /** Model identifier taken from the validated request. */
+  model: string;
+  /** System-level instructions sent to the provider. */
+  systemPrompt: string;
+  /** The PDF document, base64-encoded. */
+  pdfBase64: string;
+  /** User-turn text produced by `buildUserPrompt`. */
+  userPrompt: string;
+};
+
+/** Provider-specific wiring used by the POST handler to call one LLM API. */
+type ExtractionProviderConfig = {
+  /** Shared provider config; POST reads `apiKeyEnvVar` and `apiHeaders` from it. */
+  base: LLMProviderConfig;
+  /** Returns the endpoint URL to POST to for the given model. */
+  apiUrl: (model: string) => string;
+  /** Builds the JSON-serialisable request payload for this provider. */
+  buildRequestBody: (params: ExtractionParams) => unknown;
+  /** Pulls the generated text out of the provider's response JSON, if any. */
+  extractResponseText: (data: unknown) => string | undefined;
+};
+
+/**
+ * Per-provider wiring for PDF extraction requests.
+ *
+ * Each entry pairs the shared provider config (`base`) with:
+ * - `apiUrl`: the endpoint to POST to for a given model,
+ * - `buildRequestBody`: the provider-specific JSON payload carrying the
+ *   system prompt, the base64 PDF and the user prompt,
+ * - `extractResponseText`: how to pull the generated text out of the
+ *   provider's response JSON (`undefined` when none is found).
+ *
+ * All three providers are called with temperature 0.2 and a 16384-token
+ * output cap.
+ */
+const PROVIDERS: Record<ProviderId, ExtractionProviderConfig> = {
+  anthropic: {
+    base: anthropicConfig,
+    apiUrl: () => "https://api.anthropic.com/v1/messages",
+    // The PDF is attached as a base64 `document` content part, followed by
+    // the user prompt as a `text` part in the same user message.
+    buildRequestBody: ({ model, systemPrompt, pdfBase64, userPrompt }) => ({
+      model,
+      max_tokens: 16384,
+      temperature: 0.2,
+      system: systemPrompt,
+      messages: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "document",
+              source: {
+                type: "base64",
+                media_type: "application/pdf",
+                data: pdfBase64,
+              },
+            },
+            { type: "text", text: userPrompt },
+          ],
+        },
+      ],
+    }),
+    // Delegate to the shared config and normalise a nullish result to
+    // `undefined`.
+    extractResponseText: (data) =>
+      anthropicConfig.extractResponseText(data) ?? undefined,
+  },
+  openai: {
+    base: openaiConfig,
+    apiUrl: () => "https://api.openai.com/v1/responses",
+    // The PDF is sent inline as a data URL in an `input_file` part.
+    buildRequestBody: ({ model, systemPrompt, pdfBase64, userPrompt }) => ({
+      model,
+      instructions: systemPrompt,
+      input: [
+        {
+          role: "user",
+          content: [
+            {
+              type: "input_file",
+              filename: "paper.pdf",
+              file_data: `data:application/pdf;base64,${pdfBase64}`,
+            },
+            { type: "input_text", text: userPrompt },
+          ],
+        },
+      ],
+      temperature: 0.2,
+      max_output_tokens: 16384,
+    }),
+    // Returns the text of the first `output_text` part inside the first
+    // `message` item of `output`, or `undefined` when either is missing.
+    extractResponseText: (data) => {
+      const resp = data as {
+        output?: {
+          type: string;
+          content?: { type: string; text: string }[];
+        }[];
+      };
+      const message = resp.output?.find((o) => o.type === "message");
+      return message?.content?.find((c) => c.type === "output_text")?.text;
+    },
+  },
+  gemini: {
+    base: geminiConfig,
+    apiUrl: (model) => {
+      // `model` comes from the request body, so percent-encode it before
+      // placing it in the URL path; otherwise characters such as `/`, `?` or
+      // `#` could change which endpoint or query the request hits. The key is
+      // encoded as well so it cannot break the query string. POST only calls
+      // this after confirming the key is set, so the empty fallback exists
+      // purely to satisfy the type checker.
+      const key = process.env[geminiConfig.apiKeyEnvVar] ?? "";
+      return `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent?key=${encodeURIComponent(key)}`;
+    },
+    // The model is selected through the URL, so it is not part of the body.
+    buildRequestBody: ({ systemPrompt, pdfBase64, userPrompt }) => ({
+      system_instruction: { parts: [{ text: systemPrompt }] },
+      contents: [
+        {
+          role: "user",
+          parts: [
+            {
+              inline_data: { mime_type: "application/pdf", data: pdfBase64 },
+            },
+            { text: userPrompt },
+          ],
+        },
+      ],
+      generationConfig: {
+        temperature: 0.2,
+        maxOutputTokens: 16384,
+        responseMimeType: "application/json",
+      },
+    }),
+    // Delegate to the shared config and normalise a nullish result to
+    // `undefined`.
+    extractResponseText: (data) =>
+      geminiConfig.extractResponseText(data) ?? undefined,
+  },
+};
+
+/**
+ * How long to wait for the upstream provider before aborting, in
+ * milliseconds. Kept below the route's `maxDuration` (300 s) so the handler
+ * can still send its own error response.
+ */
+const UPSTREAM_TIMEOUT_MS = 270_000;
+
+/**
+ * Handles POST requests for the extraction route: validates the body, sends
+ * the PDF and prompts to the selected LLM provider, and returns the parsed
+ * extraction result.
+ *
+ * Response statuses:
+ * - 200: `{ success: true, data }` with the parsed extraction result.
+ * - 400: the body is not JSON or fails `ExtractionRequestSchema`.
+ * - 500: the provider's API key is not configured, or an unexpected error.
+ * - 502: the provider returned an error, an empty response, or output that
+ *   could not be parsed.
+ * - 504: the provider did not answer within `UPSTREAM_TIMEOUT_MS`.
+ *
+ * @param request Incoming request whose JSON body must satisfy
+ *   `ExtractionRequestSchema`.
+ * @returns A JSON response shaped as `ExtractionResponse`.
+ */
+export const POST = async (
+  request: NextRequest,
+): Promise<NextResponse<ExtractionResponse>> => {
+  let body: unknown;
+  try {
+    body = await request.json();
+  } catch {
+    return NextResponse.json(
+      { success: false, error: "Invalid JSON body" },
+      { status: 400 },
+    );
+  }
+
+  const parsed = ExtractionRequestSchema.safeParse(body);
+  if (!parsed.success) {
+    return NextResponse.json(
+      { success: false, error: parsed.error.message },
+      { status: 400 },
+    );
+  }
+
+  const {
+    pdfBase64,
+    researchQuestion,
+    nodeTypes,
+    model,
+    provider,
+    systemPrompt,
+  } = parsed.data;
+
+  const config = PROVIDERS[provider];
+  const apiKey = process.env[config.base.apiKeyEnvVar];
+
+  if (!apiKey) {
+    return NextResponse.json(
+      {
+        success: false,
+        error: `API key not configured for ${provider}.`,
+      },
+      { status: 500 },
+    );
+  }
+
+  const userPrompt = buildUserPrompt(nodeTypes, researchQuestion);
+
+  try {
+    const response = await fetch(config.apiUrl(model), {
+      method: "POST",
+      headers: config.base.apiHeaders(apiKey),
+      body: JSON.stringify(
+        config.buildRequestBody({
+          model,
+          systemPrompt,
+          pdfBase64,
+          userPrompt,
+        }),
+      ),
+      signal: AbortSignal.timeout(UPSTREAM_TIMEOUT_MS),
+    });
+
+    if (!response.ok) {
+      // Prefer the provider's own error message when the error body is JSON
+      // of the shape `{ error: { message } }`; otherwise report the status.
+      const errorData: unknown = await response.json().catch(() => null);
+      const errorObj = errorData as { error?: { message?: string } } | null;
+      const message =
+        errorObj?.error?.message ?? `${provider} API error: ${response.status}`;
+      return NextResponse.json(
+        { success: false, error: message },
+        { status: 502 },
+      );
+    }
+
+    const responseData: unknown = await response.json();
+    const rawText = config.extractResponseText(responseData);
+
+    if (!rawText) {
+      return NextResponse.json(
+        { success: false, error: `Empty response from ${provider}` },
+        { status: 502 },
+      );
+    }
+
+    const result = parseExtractionResponse(rawText);
+    return NextResponse.json({ success: true, data: result });
+  } catch (error) {
+    console.error("AI extraction failed:", error);
+
+    // AbortSignal.timeout() makes fetch reject with an error named
+    // "TimeoutError"; report that as a gateway timeout instead of a generic
+    // server error carrying the raw abort message.
+    if (error instanceof Error && error.name === "TimeoutError") {
+      return NextResponse.json(
+        {
+          success: false,
+          error: `Extraction failed — ${provider} did not respond within ${UPSTREAM_TIMEOUT_MS / 1000} seconds`,
+        },
+        { status: 504 },
+      );
+    }
+
+    // Malformed JSON (SyntaxError) or a schema mismatch (ZodError) means the
+    // provider answered but its output was unusable.
+    const isUpstreamError =
+      error instanceof SyntaxError ||
+      (error instanceof Error && error.name === "ZodError");
+
+    const message = isUpstreamError
+      ? "Failed to parse extraction response — LLM returned invalid output"
+      : error instanceof Error
+        ? `Extraction failed — ${error.message}`
+        : "Extraction failed";
+
+    return NextResponse.json(
+      { success: false, error: message },
+      { status: isUpstreamError ? 502 : 500 },
+    );
+  }
+};
diff --git a/apps/website/app/prompts/extraction.ts b/apps/website/app/prompts/extraction.ts
@@ -0,0 +1,73 @@
+import { NODE_TYPE_LABELS, type NodeType } from "~/types/extraction";
+
+/**
+ * Default instruction prompt for discourse-graph extraction. It describes the
+ * node types, lists extraction guidelines, and asks for a JSON-only reply with
+ * `paperTitle`, `paperAuthors` and a `candidates` array.
+ *
+ * NOTE(review): presumably sent by callers as the request's `systemPrompt`;
+ * its use is not visible in this module — confirm against the caller.
+ */
+export const DEFAULT_EXTRACTION_PROMPT = `You are an expert research analyst specializing in extracting structured discourse graph nodes from academic papers.
+
+A discourse graph is a structured representation of the key intellectual contributions, claims, evidence, and questions in a body of research literature. Each node captures one atomic idea with a type tag.
+
+## Node types
+
+- **CLM (Claim)**: A specific, falsifiable assertion or argument made in the paper. Claims should be concise, standalone statements that capture a key point.
+- **QUE (Question)**: A research question posed or implied by the paper. These can be explicitly stated or inferred from gaps in the literature.
+- **EVD (Evidence)**: A specific piece of evidence (experimental result, statistical finding, observation) that supports or refutes a claim.
+- **SRC (Source)**: A bibliographic source referenced in the paper that is relevant to the discourse.
+- **ISS (Issue)**: A problem, challenge, or open issue identified in the paper. Represents unresolved tensions or difficulties.
+- **RES (Result)**: A specific finding or outcome reported in the paper, typically from experiments or analyses.
+- **HYP (Hypothesis)**: A testable prediction or proposed explanation that the paper investigates.
+- **CON (Conclusion)**: A final synthesized takeaway or implication drawn by the authors.
+- **EXP (Experiment)**: A described experimental procedure, study, or empirical investigation.
+- **THR (Theory)**: A theoretical framework, model, or conceptual lens used or proposed in the paper.
+- **ART (Artifact)**: A concrete artifact produced or used — a tool, dataset, software, protocol, or instrument.
+- **MTD (Method)**: A methodology, technique, or analytical approach described or applied.
+- **PAT (Pattern)**: A recurring pattern, trend, or regularity identified across data or literature.
+- **PRJ (Project)**: A named research project, initiative, or collaborative effort referenced in the paper.
+- **PRB (Problem)**: A well-defined problem that the paper addresses or formulates, distinct from a general issue.
+
+## Extraction guidelines
+
+- Extract meaningful, substantive nodes — avoid trivial or overly generic statements.
+- Claims should be specific enough to be debatable.
+- Evidence should include quantitative details when available.
+- Questions should be open-ended and research-worthy.
+- Sources should include author names and year when available.
+- Results should capture specific findings, not vague summaries.
+- Conclusions should be high-level takeaways distinct from individual claims.
+- Problems should be well-scoped, not restated issues.
+- For each node, include a short supporting snippet (exact quote or figure/table reference) from the paper.
+- Include the section name and page number when determinable.
+- Aim for 10–25 nodes depending on paper length and density.
+- Prefer quality over quantity.
+
+## Output format
+
+Respond with ONLY valid JSON (no markdown fences, no commentary) matching this structure:
+
+{
+  "paperTitle": "Title of the paper",
+  "paperAuthors": ["Author 1", "Author 2"],
+  "candidates": [
+    {
+      "nodeType": "CLM",
+      "content": "The extracted node text as a clear, concise statement",
+      "supportSnippet": "Short exact quote or figure/table reference from the paper",
+      "sourceSection": "Results",
+      "pageNumber": 3
+    }
+  ]
+}`;
+
+/**
+ * Builds the user-turn prompt that tells the model which node types to
+ * extract and, optionally, which research question to focus on.
+ *
+ * @param nodeTypes Node type codes to request; each is rendered as
+ *   `CODE (Label)` using `NODE_TYPE_LABELS`.
+ * @param researchQuestion Optional focus question; omitted from the prompt
+ *   when undefined or empty.
+ * @returns The prompt text.
+ */
+export const buildUserPrompt = (
+  nodeTypes: NodeType[],
+  researchQuestion?: string,
+): string => {
+  const labelled: string[] = [];
+  for (const code of nodeTypes) {
+    labelled.push(`${code} (${NODE_TYPE_LABELS[code]})`);
+  }
+  const typeList = labelled.join(", ");
+
+  // Only mention the research question when one was actually supplied.
+  const focus = researchQuestion
+    ? `\n\nFocus extraction around this research question: ${researchQuestion}`
+    : "";
+
+  return `Extract the following node types from the attached paper: ${typeList}${focus}`;
+};
diff --git a/apps/website/app/types/extraction.ts b/apps/website/app/types/extraction.ts
@@ -0,0 +1,78 @@
+import { z } from "zod";
+
+/* eslint-disable @typescript-eslint/naming-convention */
+
+/**
+ * Three-letter codes for every discourse-graph node type that can be
+ * requested or returned. Declared `as const` so the literal codes feed the
+ * `NodeType` union and the zod enums in this module.
+ */
+export const NODE_TYPES = [
+  "CLM",
+  "QUE",
+  "EVD",
+  "SRC",
+  "ISS",
+  "RES",
+  "HYP",
+  "CON",
+  "EXP",
+  "THR",
+  "ART",
+  "MTD",
+  "PAT",
+  "PRJ",
+  "PRB",
+] as const;
+
+/** Union of the node type codes listed in `NODE_TYPES`. */
+export type NodeType = (typeof NODE_TYPES)[number];
+
+/** Human-readable label for each node type code. */
+export const NODE_TYPE_LABELS: Record<NodeType, string> = {
+  CLM: "Claim",
+  QUE: "Question",
+  EVD: "Evidence",
+  SRC: "Source",
+  ISS: "Issue",
+  RES: "Result",
+  HYP: "Hypothesis",
+  CON: "Conclusion",
+  EXP: "Experiment",
+  THR: "Theory",
+  ART: "Artifact",
+  MTD: "Method",
+  PAT: "Pattern",
+  PRJ: "Project",
+  PRB: "Problem",
+};
+
+/** Identifiers of the LLM providers a request may select. */
+export const PROVIDER_IDS = ["anthropic", "openai", "gemini"] as const;
+
+/** Union of the provider identifiers listed in `PROVIDER_IDS`. */
+export type ProviderId = (typeof PROVIDER_IDS)[number];
+
+/**
+ * Schema for one extracted candidate node: a node type code, the node text,
+ * a supporting snippet, and optionally the section and page it came from.
+ */
+export const CandidateNodeSchema = z.object({
+  nodeType: z.enum(NODE_TYPES),
+  content: z.string(),
+  supportSnippet: z.string(),
+  sourceSection: z.string().optional(),
+  pageNumber: z.number().optional(),
+});
+
+/** A single candidate node, as inferred from `CandidateNodeSchema`. */
+export type CandidateNode = z.infer<typeof CandidateNodeSchema>;
+
+/**
+ * Schema for a full extraction result: the paper's title and authors plus
+ * the list of candidate nodes.
+ */
+export const ExtractionResultSchema = z.object({
+  paperTitle: z.string(),
+  paperAuthors: z.array(z.string()),
+  candidates: z.array(CandidateNodeSchema),
+});
+
+/** An extraction result, as inferred from `ExtractionResultSchema`. */
+export type ExtractionResult = z.infer<typeof ExtractionResultSchema>;
+
+/** Schema for the JSON body accepted by the extraction endpoint. */
+export const ExtractionRequestSchema = z.object({
+  // Non-empty base64 PDF, capped at 44,000,000 characters (about 33 MB of
+  // decoded data, since base64 uses 4 characters per 3 bytes).
+  pdfBase64: z.string().min(1).max(44_000_000),
+  researchQuestion: z.string().optional(),
+  // At least one node type must be requested.
+  nodeTypes: z.array(z.enum(NODE_TYPES)).min(1),
+  model: z.string().min(1),
+  provider: z.enum(PROVIDER_IDS),
+  systemPrompt: z.string().min(1),
+});
+
+/** A validated extraction request, as inferred from `ExtractionRequestSchema`. */
+export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;
+
+/**
+ * Response body of the extraction endpoint, discriminated on `success`:
+ * the extraction result on success, or an error message on failure.
+ */
+export type ExtractionResponse =
+  | { success: true; data: ExtractionResult }
+  | { success: false; error: string };