diff --git a/package.json b/package.json index 5cc043fa32..ae90b6d96d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.19.12", + "version": "1.19.13", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/src/commands/research.ts b/src/commands/research.ts new file mode 100644 index 0000000000..5d8d70c3e8 --- /dev/null +++ b/src/commands/research.ts @@ -0,0 +1,305 @@ +import { getClient, isKeylessMode, keylessGet } from '../utils/client'; +import { writeOutput } from '../utils/output'; +import type { + GitHubItem, + InspectPaperOptions, + PaperHit, + ReadPaperOptions, + RelatedPapersOptions, + ResearchBaseOptions, + SearchGitHubOptions, + SearchPapersOptions, +} from '../types/research'; + +const BASE = '/v2/search/research'; +const MAX_AUTHORS = 15; +const MAX_ABSTRACT_CHARS = 600; +const MAX_AFFIL_CHARS = 60; +const MAX_AUTHORS_LINE_CHARS = 400; +const MAX_GITHUB_CONTENT_CHARS = 1200; + +function appendParam( + params: URLSearchParams, + key: string, + value: string | number | boolean | string[] | undefined +): void { + if (value == null) return; + if (Array.isArray(value)) { + for (const item of value) { + if (item != null && String(item).length > 0) { + params.append(key, String(item)); + } + } + return; + } + params.append(key, String(value)); +} + +function withQuery(path: string, params: URLSearchParams): string { + const qs = params.toString(); + return qs ? `${path}?${qs}` : path; +} + +async function getResearch( + path: string, + options: ResearchBaseOptions +): Promise { + if (isKeylessMode(options.apiKey, options.apiUrl)) { + return (await keylessGet(path)) as T; + } + + const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); + const response = await (app as any).http.get(path); + return (response?.data ?? {}) as T; +} + +function displayId(paper: PaperHit): string { + return paper.primaryId ?? 'missing-primary-id'; +} + +function fmtAuthors( + authors?: string | { name: string; affiliation?: string }[] +): string | null { + if (!authors) return null; + + let shown: string[]; + let total: number; + if (typeof authors === 'string') { + const names = authors + .split(',') + .map((name) => name.trim()) + .filter(Boolean); + if (names.length === 0) return null; + total = names.length; + shown = names.slice(0, MAX_AUTHORS); + } else { + if (authors.length === 0) return null; + total = authors.length; + shown = authors.slice(0, MAX_AUTHORS).map((author) => { + const affiliation = author.affiliation?.trim(); + return affiliation + ? `${author.name} (${affiliation.slice(0, MAX_AFFIL_CHARS)})` + : author.name; + }); + } + + const extra = total > MAX_AUTHORS ? `; +${total - MAX_AUTHORS} more` : ''; + return ('Authors: ' + shown.join('; ') + extra).slice( + 0, + MAX_AUTHORS_LINE_CHARS + ); +} + +function fmtHits(results?: PaperHit[]): string { + if (!results || results.length === 0) return '(no results)'; + + return results + .map((paper) => { + const lines = [`## [${displayId(paper)}] ${paper.title ?? '(untitled)'}`]; + const authors = fmtAuthors(paper.authors); + if (authors) lines.push(authors); + lines.push( + (paper.abstract || '(no abstract)') + .replace(/\s+/g, ' ') + .slice(0, MAX_ABSTRACT_CHARS) + ); + return lines.join('\n'); + }) + .join('\n\n'); +} + +function fmtPaperMetadata(paper?: PaperHit): string { + if (!paper) return '(paper not found)'; + + const lines = [`# ${paper.title ?? '(untitled)'}`, '']; + lines.push(`Paper ID: ${paper.paperId ?? '?'}`); + + const ids = Object.entries(paper.ids ?? {}) + .flatMap(([namespace, values]) => + values.map((value) => `${namespace}:${value}`) + ) + .join(', '); + if (ids) lines.push(`IDs: ${ids}`); + + const authors = fmtAuthors(paper.authors); + if (authors) lines.push(authors); + + if (paper.categories?.length) { + lines.push(`Categories: ${paper.categories.join(', ')}`); + } + + const dates = [ + paper.createdDate ? `created ${paper.createdDate}` : '', + paper.updateDate ? `updated ${paper.updateDate}` : '', + ] + .filter(Boolean) + .join('; '); + if (dates) lines.push(`Dates: ${dates}`); + + lines.push('', '## Abstract'); + lines.push((paper.abstract || '(no abstract)').replace(/\s+/g, ' ')); + return lines.join('\n'); +} + +function fmtGithub(results?: GitHubItem[]): string { + if (!results || results.length === 0) return '(no results)'; + + return results + .map((item) => { + const lines: string[] = []; + if (item.resultType === 'repo_readme') { + lines.push(`[${item.repo ?? '?'}] README`); + } else { + const ref = item.number != null ? `#${item.number}` : ''; + const meta = [ + item.pageType, + item.segmentCount ? `${item.segmentCount} segments` : '', + ] + .filter(Boolean) + .join(', '); + lines.push(`[${item.repo ?? '?'}${ref}]${meta ? ` (${meta})` : ''}`); + } + const url = item.readmeUrl ?? item.url; + if (url) lines.push(url); + const body = (item.contentMd || item.snippet || '').trim(); + lines.push( + body ? body.slice(0, MAX_GITHUB_CONTENT_CHARS) : '(no content)' + ); + return lines.join('\n'); + }) + .join('\n\n'); +} + +function writeResearchOutput( + data: unknown, + readable: string, + options: ResearchBaseOptions +): void { + const content = + options.json || options.pretty + ? options.pretty + ? JSON.stringify(data, null, 2) + : JSON.stringify(data) + : readable; + writeOutput(content, options.output, !!options.output); +} + +function handleError(error: unknown): never { + console.error( + 'Error:', + error instanceof Error ? error.message : 'Unknown error occurred' + ); + process.exit(1); +} + +export async function handleSearchPapersCommand( + options: SearchPapersOptions +): Promise { + try { + const params = new URLSearchParams(); + appendParam(params, 'query', options.query); + appendParam(params, 'k', options.k); + appendParam(params, 'authors', options.authors); + appendParam(params, 'categories', options.categories); + appendParam(params, 'from', options.from); + appendParam(params, 'to', options.to); + const data = await getResearch<{ results?: PaperHit[] }>( + withQuery(`${BASE}/papers`, params), + options + ); + writeResearchOutput(data, fmtHits(data.results), options); + } catch (error) { + handleError(error); + } +} + +export async function handleInspectPaperCommand( + options: InspectPaperOptions +): Promise { + try { + const data = await getResearch<{ paper?: PaperHit }>( + `${BASE}/papers/${encodeURIComponent(options.paperId)}`, + options + ); + writeResearchOutput(data, fmtPaperMetadata(data.paper), options); + } catch (error) { + handleError(error); + } +} + +export async function handleRelatedPapersCommand( + options: RelatedPapersOptions +): Promise { + try { + const [primary, ...anchors] = options.seedIds; + const params = new URLSearchParams(); + appendParam(params, 'intent', options.intent); + appendParam(params, 'mode', options.mode); + appendParam(params, 'k', options.k); + appendParam(params, 'rerank', options.rerank); + appendParam(params, 'anchor', anchors); + const data = await getResearch<{ + results?: PaperHit[]; + poolSize?: number; + note?: string | null; + }>( + withQuery( + `${BASE}/papers/${encodeURIComponent(primary)}/similar`, + params + ), + options + ); + const note = data.note ? `\nnote: ${data.note}` : ''; + writeResearchOutput( + data, + `${fmtHits(data.results)}\n(poolSize=${data.poolSize ?? 0})${note}`, + options + ); + } catch (error) { + handleError(error); + } +} + +export async function handleReadPaperCommand( + options: ReadPaperOptions +): Promise { + try { + const params = new URLSearchParams(); + appendParam(params, 'query', options.question); + appendParam(params, 'k', options.k); + const data = await getResearch<{ passages?: { text: string }[] }>( + withQuery( + `${BASE}/papers/${encodeURIComponent(options.paperId)}`, + params + ), + options + ); + const passages = data.passages ?? []; + writeResearchOutput( + data, + passages.length + ? passages.map((passage) => passage.text).join('\n---\n') + : '(no full-text passages available for this paper)', + options + ); + } catch (error) { + handleError(error); + } +} + +export async function handleSearchGitHubCommand( + options: SearchGitHubOptions +): Promise { + try { + const params = new URLSearchParams(); + appendParam(params, 'query', options.query); + appendParam(params, 'k', options.k); + const data = await getResearch<{ results?: GitHubItem[] }>( + withQuery(`${BASE}/github`, params), + options + ); + writeResearchOutput(data, fmtGithub(data.results), options); + } catch (error) { + handleError(error); + } +} diff --git a/src/index.ts b/src/index.ts index 9dedb48472..6ebd860e6b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,7 +5,7 @@ * Entry point for the CLI application */ -import { Command } from 'commander'; +import { Command, Option } from 'commander'; import { readFileSync } from 'fs'; import { handleScrapeCommand, @@ -20,6 +20,13 @@ import { handleMapCommand } from './commands/map'; import { handleParseCommand } from './commands/parse'; import { createMonitorCommand } from './commands/monitor'; import { handleSearchCommand } from './commands/search'; +import { + handleInspectPaperCommand, + handleReadPaperCommand, + handleRelatedPapersCommand, + handleSearchGitHubCommand, + handleSearchPapersCommand, +} from './commands/research'; import { handleSearchFeedbackCommand, parseValuableSourcesArg, @@ -61,6 +68,7 @@ import { ensureAuthenticated, printBanner } from './utils/auth'; import packageJson from '../package.json'; import type { SearchSource, SearchCategory } from './types/search'; import type { ScrapeFormat } from './types/scrape'; +import type { RelatedPapersOptions } from './types/research'; import type { AgentWebhookConfig } from 'firecrawl'; import { createCreateCommand } from './commands/create'; @@ -199,6 +207,22 @@ function parseWebhookOption( return trimmed; } +function parseCommaList(raw: string | undefined): string[] | undefined { + if (!raw) return undefined; + const values = raw + .split(',') + .map((value) => value.trim()) + .filter(Boolean); + return values.length > 0 ? values : undefined; +} + +function researchLimit(options: { + limit?: number; + k?: number; +}): number | undefined { + return options.k ?? options.limit; +} + function parseAgentWebhookOption( raw: string | undefined, label: string @@ -1012,6 +1036,240 @@ function createSearchCommand(): Command { return searchCmd; } +/** + * Create and configure the research command group + */ +function createResearchCommand(): Command { + const researchCmd = new Command('research') + .description('Research arXiv papers and GitHub history using Firecrawl') + .addHelpText( + 'after', + ` +Examples: + $ firecrawl research search-papers "diffusion image synthesis" --limit 20 + $ firecrawl research inspect-paper arxiv:1706.03762 + $ firecrawl research related-papers arxiv:1706.03762 --intent "efficient transformers" + $ firecrawl research read-paper arxiv:1706.03762 --question "What is the attention mechanism?" + $ firecrawl research search-github "foundationdb queue worker shutdown" --limit 10 +` + ); + + researchCmd + .command('search-papers') + .description( + 'Primary entry point for finding arXiv papers by topic. Semantic (HyDE) search over arXiv abstracts; returns ranked papers with arXiv id, title, and abstract. The query should be a natural-language description of what you want. Run several distinct framings of the question rather than one query. Returns up to k results (default 40).' + ) + .argument('', 'Natural-language description of the papers to find') + .option( + '--limit ', + 'Number of results to return (default: 40)', + parseInt + ) + .addOption(new Option('--k ').argParser(parseInt).hideHelp()) + .option( + '--authors ', + 'Comma-separated author substring filter(s); all must match case-insensitively' + ) + .option( + '--categories ', + 'Comma-separated arXiv category filter(s), e.g. cs.LG,cs.IR; all must match' + ) + .option( + '--from ', + 'Inclusive lower bound on created/updated date (YYYY-MM-DD)' + ) + .option( + '--to ', + 'Inclusive upper bound on created/updated date (YYYY-MM-DD)' + ) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as compact JSON', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (query, options) => { + await handleSearchPapersCommand({ + query, + k: researchLimit(options), + authors: parseCommaList(options.authors), + categories: parseCommaList(options.categories), + from: options.from, + to: options.to, + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + json: options.json, + pretty: options.pretty, + }); + }); + + researchCmd + .command('inspect-paper') + .description( + 'Fetch canonical metadata for one paper by primaryId or canonical paperId. Use this after search/related results when you need the full title, abstract, authors, categories, source ids, and dates rendered as markdown.' + ) + .argument( + '', + 'Canonical paperId or primaryId such as arxiv:1706.03762, pmcid:PMC12530322, pmid:40953549, or doi:10.1016/j.neunet.2025.108095' + ) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as compact JSON', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (paperId, options) => { + await handleInspectPaperCommand({ + paperId, + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + json: options.json, + pretty: options.pretty, + }); + }); + + researchCmd + .command('related-papers') + .description( + 'Expand from anchor papers you have already found, via the citation graph, ranked and filtered to a natural-language intent. Pass arXiv ids of your strongest hits as seed ids. Modes: similar, citers, references. This reaches relevant papers that plain search misses. A similar call already runs a deep multi-round expansion internally.' + ) + .argument( + '', + 'Seed paper ids, e.g. arxiv:1706.03762 2014215642691656232' + ) + .requiredOption( + '--intent ', + 'Natural-language ranking/filtering intent' + ) + .option( + '--mode ', + 'Similarity mode: similar, citers, references (default: similar)' + ) + .option( + '--limit ', + 'Number of results to return (default: 40)', + parseInt + ) + .addOption(new Option('--k ').argParser(parseInt).hideHelp()) + .option( + '--rerank', + 'Apply an additional rerank over the fused candidates', + false + ) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as compact JSON', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (seedIds: string[], options) => { + const mode = options.mode as RelatedPapersOptions['mode'] | undefined; + if ( + mode !== undefined && + !['similar', 'citers', 'references'].includes(mode) + ) { + console.error( + 'Error: Invalid mode. Valid modes are: similar, citers, references' + ); + process.exit(1); + } + await handleRelatedPapersCommand({ + seedIds, + intent: options.intent, + mode, + k: researchLimit(options), + rerank: options.rerank, + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + json: options.json, + pretty: options.pretty, + }); + }); + + researchCmd + .command('read-paper') + .description( + 'Read the most relevant in-body full-text passages of one specific paper for a question. Use this to verify whether a candidate actually satisfies a constraint before you include or reject it. Returns the best-matching passages, or a notice if the paper full text is unavailable.' + ) + .argument( + '', + 'Canonical paperId or primaryId such as arxiv:1706.03762, pmcid:PMC12530322, pmid:40953549, or doi:10.1016/j.neunet.2025.108095' + ) + .requiredOption( + '--question ', + 'Question to answer from the paper body' + ) + .option( + '--limit ', + 'Number of passages to return (default: 4)', + parseInt + ) + .addOption(new Option('--k ').argParser(parseInt).hideHelp()) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as compact JSON', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (paperId, options) => { + await handleReadPaperCommand({ + paperId, + question: options.question, + k: researchLimit(options), + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + json: options.json, + pretty: options.pretty, + }); + }); + + researchCmd + .command('search-github') + .description( + 'Search GitHub issue/PR history and repository readmes. Returns ranked matches with repo, url, a short snippet, and when available the full matched content in markdown.' + ) + .argument('', 'GitHub history/readme search query') + .option( + '--limit ', + 'Number of results to return (max: 100)', + parseInt + ) + .addOption(new Option('--k ').argParser(parseInt).hideHelp()) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as compact JSON', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (query, options) => { + await handleSearchGitHubCommand({ + query, + k: researchLimit(options), + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + json: options.json, + pretty: options.pretty, + }); + }); + + return researchCmd; +} + /** * Create the search-feedback command. Used by agents (CLI, MCP, skills) to * report search-result quality after a `firecrawl search` call. The first @@ -1769,6 +2027,7 @@ program.addCommand(createMapCommand()); program.addCommand(createParseCommand()); program.addCommand(createMonitorCommand()); program.addCommand(createSearchCommand()); +program.addCommand(createResearchCommand()); program.addCommand(createFeedbackCommand()); program.addCommand(createSearchFeedbackCommand()); program.addCommand(createAgentCommand()); diff --git a/src/types/research.ts b/src/types/research.ts new file mode 100644 index 0000000000..d4c122219f --- /dev/null +++ b/src/types/research.ts @@ -0,0 +1,66 @@ +export interface ResearchBaseOptions { + apiKey?: string; + apiUrl?: string; + output?: string; + json?: boolean; + pretty?: boolean; +} + +export interface SearchPapersOptions extends ResearchBaseOptions { + query: string; + k?: number; + authors?: string[]; + categories?: string[]; + from?: string; + to?: string; +} + +export interface InspectPaperOptions extends ResearchBaseOptions { + paperId: string; +} + +export interface RelatedPapersOptions extends ResearchBaseOptions { + seedIds: string[]; + intent: string; + mode?: 'similar' | 'citers' | 'references'; + k?: number; + rerank?: boolean; +} + +export interface ReadPaperOptions extends ResearchBaseOptions { + paperId: string; + question: string; + k?: number; +} + +export interface SearchGitHubOptions extends ResearchBaseOptions { + query: string; + k?: number; +} + +export interface PaperHit { + paperId?: string; + primaryId?: string; + ids?: Record; + title?: string; + abstract?: string; + authors?: string | { name: string; affiliation?: string }[]; + categories?: string[]; + createdDate?: string; + updateDate?: string; + signals?: Record; + score?: number; +} + +export interface GitHubItem { + resultType?: string; + repo?: string; + url?: string; + pageType?: string; + number?: number; + segmentCount?: number; + readmeUrl?: string; + title?: string; + snippet?: string; + contentMd?: string; +} diff --git a/src/utils/client.ts b/src/utils/client.ts index e6c738bbba..6519495bfc 100644 --- a/src/utils/client.ts +++ b/src/utils/client.ts @@ -48,6 +48,21 @@ export async function keylessRequest( return json; } +export async function keylessGet(path: string): Promise { + const apiUrl = (getConfig().apiUrl || DEFAULT_API_URL).replace(/\/$/, ''); + const response = await fetch(`${apiUrl}${path}`, { + method: 'GET', + headers: { 'Content-Type': 'application/json' }, + }); + const json: any = await response.json().catch(() => ({})); + if (!response.ok) { + throw new Error( + json?.error || `Firecrawl request failed (HTTP ${response.status})` + ); + } + return json; +} + /** * Get or create the Firecrawl client instance * Uses global configuration if available, otherwise creates with provided options