diff --git a/.github/workflows/test-java-sdk.yml b/.github/workflows/test-java-sdk.yml new file mode 100644 index 0000000000..c5bef8d6a2 --- /dev/null +++ b/.github/workflows/test-java-sdk.yml @@ -0,0 +1,70 @@ +name: Java SDK Test Suite + +on: + pull_request: + branches: + - main + paths: + - apps/java-sdk/** + - .github/workflows/test-java-sdk.yml + push: + branches: + - main + paths: + - apps/java-sdk/** + workflow_dispatch: + +jobs: + build-and-test: + name: Build and Test + runs-on: blacksmith-4vcpu-ubuntu-2404 + if: >- + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request' || + (github.event_name == 'push' && github.ref == 'refs/heads/main') + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "11" + + - name: Cache Gradle packages + uses: actions/cache@v4 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + key: ${{ runner.os }}-gradle-${{ hashFiles('apps/java-sdk/**/*.gradle.kts', 'apps/java-sdk/gradle/wrapper/gradle-wrapper.properties') }} + restore-keys: | + ${{ runner.os }}-gradle- + + - name: Grant execute permission for gradlew + working-directory: ./apps/java-sdk + run: chmod +x gradlew + + - name: Build + working-directory: ./apps/java-sdk + run: ./gradlew build -x test + + - name: Run unit tests + working-directory: ./apps/java-sdk + run: ./gradlew test + + - name: Run E2E tests + if: env.FIRECRAWL_API_KEY != '' + working-directory: ./apps/java-sdk + env: + FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} + run: ./gradlew test + + - name: Publish test report + if: always() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.fork == false) + uses: dorny/test-reporter@v1 + with: + name: Java SDK Test Report + path: apps/java-sdk/build/test-results/test/*.xml + reporter: java-junit + fail-on-error: true diff --git a/apps/api/src/controllers/v2/browser.ts b/apps/api/src/controllers/v2/browser.ts index 
d2698db90c..8961268086 100644 --- a/apps/api/src/controllers/v2/browser.ts +++ b/apps/api/src/controllers/v2/browser.ts @@ -22,7 +22,7 @@ import { billTeam } from "../../services/billing/credit_billing"; import { enqueueBrowserSessionActivity } from "../../lib/browser-session-activity"; import { logRequest } from "../../services/logging/log_job"; -const BROWSER_CREDITS_PER_HOUR = 100; +const BROWSER_CREDITS_PER_HOUR = 120; /** * Calculate credits to bill for a browser session based on its duration. diff --git a/apps/api/src/lib/concurrency-limit.ts b/apps/api/src/lib/concurrency-limit.ts index 778ad0fe01..267c4738f8 100644 --- a/apps/api/src/lib/concurrency-limit.ts +++ b/apps/api/src/lib/concurrency-limit.ts @@ -173,7 +173,7 @@ export async function pushCrawlConcurrencyLimitActiveJob( ); } -async function removeCrawlConcurrencyLimitActiveJob( +export async function removeCrawlConcurrencyLimitActiveJob( crawl_id: string, id: string, ) { @@ -188,7 +188,7 @@ async function removeCrawlConcurrencyLimitActiveJob( * @param teamId * @returns A job that can be run, or null if there are no more jobs to run. 
*/ -async function getNextConcurrentJob(teamId: string): Promise<{ +export async function getNextConcurrentJob(teamId: string): Promise<{ job: ConcurrencyLimitedJob; timeout: number; } | null> { @@ -283,6 +283,11 @@ async function getNextConcurrentJob(teamId: string): Promise<{ export async function concurrentJobDone(job: NuQJob) { if (job.id && job.data && job.data.team_id) { await removeConcurrencyLimitActiveJob(job.data.team_id, job.id); + await getRedisConnection().zrem( + constructQueueKey(job.data.team_id), + job.id, + ); + await getRedisConnection().del(constructJobKey(job.id)); await cleanOldConcurrencyLimitEntries(job.data.team_id); await cleanOldConcurrencyLimitedJobs(job.data.team_id); @@ -291,89 +296,92 @@ export async function concurrentJobDone(job: NuQJob) { await cleanOldCrawlConcurrencyLimitEntries(job.data.crawl_id); } - let i = 0; - for (; i < 10; i++) { - const maxTeamConcurrency = - ( - await getACUCTeam( - job.data.team_id, - false, - true, - job.data.is_extract - ? RateLimiterMode.Extract - : RateLimiterMode.Crawl, - ) - )?.concurrency ?? 2; + const maxTeamConcurrency = + ( + await getACUCTeam( + job.data.team_id, + false, + true, + job.data.is_extract ? RateLimiterMode.Extract : RateLimiterMode.Crawl, + ) + )?.concurrency ?? 
2; + + let staleSkipped = 0; + while (staleSkipped < 100) { const currentActiveConcurrency = ( await getConcurrencyLimitActiveJobs(job.data.team_id) ).length; - if (currentActiveConcurrency < maxTeamConcurrency) { - const nextJob = await getNextConcurrentJob(job.data.team_id); - if (nextJob !== null) { - await pushConcurrencyLimitActiveJob( - job.data.team_id, - nextJob.job.id, - 60 * 1000, - ); + if (currentActiveConcurrency >= maxTeamConcurrency) break; - if (nextJob.job.data.crawl_id) { - await pushCrawlConcurrencyLimitActiveJob( - nextJob.job.data.crawl_id, - nextJob.job.id, - 60 * 1000, - ); - - const sc = await getCrawl(nextJob.job.data.crawl_id); - if (sc !== null && typeof sc.crawlerOptions?.delay === "number") { - await new Promise(resolve => - setTimeout(resolve, sc.crawlerOptions.delay * 1000), - ); - } - } + const nextJob = await getNextConcurrentJob(job.data.team_id); + if (nextJob === null) break; - abTestJob(nextJob.job.data); - - const promotedSuccessfully = - (await scrapeQueue.promoteJobFromBacklogOrAdd( - nextJob.job.id, - nextJob.job.data, - { - priority: nextJob.job.priority, - listenable: nextJob.job.listenable, - ownerId: nextJob.job.data.team_id ?? undefined, - groupId: nextJob.job.data.crawl_id ?? 
undefined, - }, - )) !== null; - - if (promotedSuccessfully) { - logger.debug("Successfully promoted concurrent queued job", { - teamId: job.data.team_id, - jobId: nextJob.job.id, - zeroDataRetention: nextJob.job.data?.zeroDataRetention, - }); - break; - } else { - logger.warn( - "Was unable to promote concurrent queued job as it already exists in the database", - { - teamId: job.data.team_id, - jobId: nextJob.job.id, - zeroDataRetention: nextJob.job.data?.zeroDataRetention, - }, - ); - } - } else { - break; + await pushConcurrencyLimitActiveJob( + job.data.team_id, + nextJob.job.id, + 60 * 1000, + ); + + if (nextJob.job.data.crawl_id) { + await pushCrawlConcurrencyLimitActiveJob( + nextJob.job.data.crawl_id, + nextJob.job.id, + 60 * 1000, + ); + + const sc = await getCrawl(nextJob.job.data.crawl_id); + if (sc !== null && typeof sc.crawlerOptions?.delay === "number") { + await new Promise(resolve => + setTimeout(resolve, sc.crawlerOptions.delay * 1000), + ); } - } else { + } + + abTestJob(nextJob.job.data); + + const promotedSuccessfully = + (await scrapeQueue.promoteJobFromBacklogOrAdd( + nextJob.job.id, + nextJob.job.data, + { + priority: nextJob.job.priority, + listenable: nextJob.job.listenable, + ownerId: nextJob.job.data.team_id ?? undefined, + groupId: nextJob.job.data.crawl_id ?? 
undefined, + }, + )) !== null; + + if (promotedSuccessfully) { + logger.debug("Successfully promoted concurrent queued job", { + teamId: job.data.team_id, + jobId: nextJob.job.id, + zeroDataRetention: nextJob.job.data?.zeroDataRetention, + }); break; + } else { + logger.warn( + "Was unable to promote concurrent queued job as it already exists in the database", + { + teamId: job.data.team_id, + jobId: nextJob.job.id, + zeroDataRetention: nextJob.job.data?.zeroDataRetention, + }, + ); + await removeConcurrencyLimitActiveJob(job.data.team_id, nextJob.job.id); + if (nextJob.job.data.crawl_id) { + await removeCrawlConcurrencyLimitActiveJob( + nextJob.job.data.crawl_id, + nextJob.job.id, + ); + } + staleSkipped++; } } - if (i === 10) { + if (staleSkipped >= 100) { logger.warn( - "Failed to promote a concurrent job after 10 iterations, bailing!", + "Skipped 100 stale entries in concurrency queue without a successful promotion", { teamId: job.data.team_id, }, diff --git a/apps/api/src/lib/concurrency-queue-reconciler.ts b/apps/api/src/lib/concurrency-queue-reconciler.ts index b22bf4e1e9..b6bb041ba4 100644 --- a/apps/api/src/lib/concurrency-queue-reconciler.ts +++ b/apps/api/src/lib/concurrency-queue-reconciler.ts @@ -1,13 +1,17 @@ import { Logger } from "winston"; +import { validate as isUUID } from "uuid"; import { getACUCTeam } from "../controllers/auth"; import { getRedisConnection } from "../services/queue-service"; import { scrapeQueue, type NuQJob } from "../services/worker/nuq"; import { RateLimiterMode, type ScrapeJobData } from "../types"; import { getConcurrencyLimitActiveJobs, + getNextConcurrentJob, pushConcurrencyLimitActiveJob, pushConcurrencyLimitedJob, pushCrawlConcurrencyLimitActiveJob, + removeConcurrencyLimitActiveJob, + removeCrawlConcurrencyLimitActiveJob, } from "./concurrency-limit"; import { getCrawl } from "./crawl-redis"; import { logger as _logger } from "./logger"; @@ -192,6 +196,105 @@ async function reconcileTeam( return { jobsStarted, 
jobsRequeued }; } +async function drainQueue( + ownerId: string, + teamLogger: Logger, +): Promise<{ jobsPromoted: number; staleSkipped: number }> { + const maxCrawlConcurrency = + (await getACUCTeam(ownerId, false, true, RateLimiterMode.Crawl)) + ?.concurrency ?? 2; + const maxExtractConcurrency = + (await getACUCTeam(ownerId, false, true, RateLimiterMode.Extract)) + ?.concurrency ?? 2; + + const activeIds = await getConcurrencyLimitActiveJobs(ownerId); + const activeJobs = await scrapeQueue.getJobs(activeIds, teamLogger); + let crawlCount = 0; + let extractCount = 0; + for (const aj of activeJobs) { + if (isExtractJob(aj.data)) extractCount++; + else crawlCount++; + } + + let jobsPromoted = 0; + let staleSkipped = 0; + let typeBlocked = 0; + + while (staleSkipped + typeBlocked < 100) { + if ( + crawlCount >= maxCrawlConcurrency && + extractCount >= maxExtractConcurrency + ) + break; + + const nextJob = await getNextConcurrentJob(ownerId); + if (nextJob === null) break; + + const isExtract = isExtractJob(nextJob.job.data); + const typeLimit = isExtract ? maxExtractConcurrency : maxCrawlConcurrency; + const typeCount = isExtract ? extractCount : crawlCount; + + if (typeCount >= typeLimit) { + await pushConcurrencyLimitedJob( + ownerId, + { + id: nextJob.job.id, + data: nextJob.job.data, + priority: nextJob.job.priority, + listenable: nextJob.job.listenable, + }, + nextJob.timeout === Infinity ? 172800000 : nextJob.timeout, + ); + typeBlocked++; + continue; + } + + await pushConcurrencyLimitActiveJob(ownerId, nextJob.job.id, 60 * 1000); + if (nextJob.job.data.crawl_id) { + await pushCrawlConcurrencyLimitActiveJob( + nextJob.job.data.crawl_id, + nextJob.job.id, + 60 * 1000, + ); + } + + const promoted = await scrapeQueue.promoteJobFromBacklogOrAdd( + nextJob.job.id, + nextJob.job.data, + { + priority: nextJob.job.priority, + listenable: nextJob.job.listenable, + ownerId: nextJob.job.data.team_id ?? undefined, + groupId: nextJob.job.data.crawl_id ?? 
undefined, + }, + ); + + if (promoted !== null) { + if (isExtract) extractCount++; + else crawlCount++; + jobsPromoted++; + } else { + await removeConcurrencyLimitActiveJob(ownerId, nextJob.job.id); + if (nextJob.job.data.crawl_id) { + await removeCrawlConcurrencyLimitActiveJob( + nextJob.job.data.crawl_id, + nextJob.job.id, + ); + } + staleSkipped++; + } + } + + if (staleSkipped >= 100) { + teamLogger.warn( + "Queue drain hit 100 stale entries without fully draining", + { ownerId }, + ); + } + + return { jobsPromoted, staleSkipped }; +} + export async function reconcileConcurrencyQueue( options: ReconcileOptions = {}, ): Promise { @@ -200,11 +303,21 @@ export async function reconcileConcurrencyQueue( scopedTeamId: options.teamId, }); - const owners = options.teamId - ? [options.teamId] - : await scrapeQueue.getBackloggedOwnerIDs(logger); - - const ownerIds = owners.filter((x): x is string => typeof x === "string"); + let ownerIds: string[]; + if (options.teamId) { + ownerIds = [options.teamId]; + } else { + const backlogOwners = ( + await scrapeQueue.getBackloggedOwnerIDs(logger) + ).filter((x): x is string => typeof x === "string"); + const queueKeys = await getRedisConnection().smembers( + "concurrency-limit-queues", + ); + const queueOwners = queueKeys + .map(k => k.replace("concurrency-limit-queue:", "")) + .filter(id => id.length > 0 && isUUID(id)); + ownerIds = [...new Set([...backlogOwners, ...queueOwners])]; + } const result: ReconcileResult = { teamsScanned: ownerIds.length, @@ -223,6 +336,15 @@ export async function reconcileConcurrencyQueue( result.jobsStarted += teamResult.jobsStarted; result.jobsRequeued += teamResult.jobsRequeued; } + + const drainResult = await drainQueue(ownerId, teamLogger); + if (drainResult.jobsPromoted > 0 || drainResult.staleSkipped > 0) { + result.jobsStarted += drainResult.jobsPromoted; + teamLogger.info("Queue drain promoted jobs", { + jobsPromoted: drainResult.jobsPromoted, + staleSkipped: drainResult.staleSkipped, + }); + 
} } catch (error) { teamLogger.error("Failed to reconcile team, skipping", { error }); } diff --git a/apps/java-sdk/.gitignore b/apps/java-sdk/.gitignore new file mode 100644 index 0000000000..8cc913d551 --- /dev/null +++ b/apps/java-sdk/.gitignore @@ -0,0 +1,17 @@ +.gradle/ +build/ +*.class +*.jar +!gradle/wrapper/gradle-wrapper.jar +*.war +*.ear +*.iml +.idea/ +*.ipr +*.iws +out/ +.settings/ +.classpath +.project +bin/ +local.properties diff --git a/apps/java-sdk/README.md b/apps/java-sdk/README.md new file mode 100644 index 0000000000..c205c738e6 --- /dev/null +++ b/apps/java-sdk/README.md @@ -0,0 +1,434 @@ +# Firecrawl Java SDK + +Java SDK for the [Firecrawl](https://firecrawl.dev) v2 web scraping API. + +## Prerequisites + +Before using the Java SDK, ensure you have the following installed: + +### Java Development Kit (JDK) + +- **Required:** Java 11 or later +- **Installation (macOS):** + ```bash + brew install openjdk + ``` + + Then add Java to your PATH: + ```bash + echo 'export PATH="/opt/homebrew/opt/openjdk/bin:$PATH"' >> ~/.zshrc + source ~/.zshrc + ``` + +- **Installation (Linux):** + ```bash + # Ubuntu/Debian + sudo apt-get update + sudo apt-get install openjdk-11-jdk + + # Fedora/RHEL + sudo dnf install java-11-openjdk-devel + ``` + +- **Verify Installation:** + ```bash + java --version + ``` + +### Gradle (for building from source) + +- **Required:** Gradle 8+ +- **Installation (macOS):** + ```bash + brew install gradle + ``` + +- **Installation (Linux):** + ```bash + # Ubuntu/Debian + sudo apt-get install gradle + + # Or use SDKMAN + curl -s "https://get.sdkman.io" | bash + sdk install gradle + ``` + +- **Verify Installation:** + ```bash + gradle --version + ``` + +### API Key Setup + +1. Get your API key from [Firecrawl Dashboard](https://firecrawl.dev) +2. Set it as an environment variable: + ```bash + export FIRECRAWL_API_KEY="fc-your-api-key-here" + ``` + +3. 
**Or** add it to your shell profile for persistence: + ```bash + # For Zsh (macOS/Linux) + echo 'export FIRECRAWL_API_KEY="fc-your-api-key-here"' >> ~/.zshrc + source ~/.zshrc + + # For Bash + echo 'export FIRECRAWL_API_KEY="fc-your-api-key-here"' >> ~/.bashrc + source ~/.bashrc + ``` + +## Installation + +### Gradle (Kotlin DSL) + +```kotlin +implementation("com.firecrawl:firecrawl-java:1.0.0") +``` + +### Gradle (Groovy) + +```groovy +implementation 'com.firecrawl:firecrawl-java:1.0.0' +``` + +### Maven + +```xml + + com.firecrawl + firecrawl-java + 1.0.0 + +``` + +## Quick Start + +```java +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.*; +import java.util.List; + +// Create client with explicit API key +FirecrawlClient client = FirecrawlClient.builder() + .apiKey("fc-your-api-key") + .build(); + +// Scrape a page +Document doc = client.scrape("https://example.com", + ScrapeOptions.builder() + .formats(List.of("markdown")) + .build()); + +System.out.println(doc.getMarkdown()); +``` + +Or create a client from the environment variable: + +```java +// export FIRECRAWL_API_KEY=fc-your-api-key +FirecrawlClient client = FirecrawlClient.fromEnv(); +``` + +## API Reference + +### Scrape + +Scrape a single URL and get the content in various formats. 
+ +```java +Document doc = client.scrape("https://example.com", + ScrapeOptions.builder() + .formats(List.of("markdown", "html")) + .onlyMainContent(true) + .waitFor(5000) + .build()); + +System.out.println(doc.getMarkdown()); +System.out.println(doc.getMetadata().get("title")); +``` + +#### JSON Extraction + +```java +import com.firecrawl.models.JsonFormat; + +JsonFormat jsonFmt = JsonFormat.builder() + .prompt("Extract the product name and price") + .schema(Map.of( + "type", "object", + "properties", Map.of( + "name", Map.of("type", "string"), + "price", Map.of("type", "number") + ) + )) + .build(); + +Document doc = client.scrape("https://example.com/product", + ScrapeOptions.builder() + .formats(List.of(jsonFmt)) + .build()); + +System.out.println(doc.getJson()); +``` + +### Crawl + +Crawl an entire website. The `crawl()` method polls until completion. + +```java +// Convenience method — polls until done +CrawlJob job = client.crawl("https://example.com", + CrawlOptions.builder() + .limit(50) + .maxDiscoveryDepth(3) + .scrapeOptions(ScrapeOptions.builder() + .formats(List.of("markdown")) + .build()) + .build()); + +for (Document doc : job.getData()) { + System.out.println(doc.getMetadata().get("sourceURL")); +} +``` + +#### Async Crawl (manual polling) + +```java +CrawlResponse start = client.startCrawl("https://example.com", + CrawlOptions.builder().limit(100).build()); + +System.out.println("Job started: " + start.getId()); + +// Poll manually +CrawlJob status; +do { + try { Thread.sleep(2000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); break; } + status = client.getCrawlStatus(start.getId()); + System.out.println(status.getCompleted() + "/" + status.getTotal()); +} while (!status.isDone()); +``` + +### Batch Scrape + +Scrape multiple URLs in parallel. 
+ +```java +BatchScrapeJob job = client.batchScrape( + List.of("https://example.com", "https://example.org"), + BatchScrapeOptions.builder() + .options(ScrapeOptions.builder() + .formats(List.of("markdown")) + .build()) + .build()); + +for (Document doc : job.getData()) { + System.out.println(doc.getMarkdown()); +} +``` + +### Map + +Discover all URLs on a website. + +```java +MapData data = client.map("https://example.com", + MapOptions.builder() + .limit(100) + .search("blog") + .build()); + +for (Map link : data.getLinks()) { + System.out.println(link.get("url") + " - " + link.get("title")); +} +``` + +### Search + +Search the web and optionally scrape results. + +```java +SearchData results = client.search("firecrawl web scraping", + SearchOptions.builder() + .limit(10) + .build()); + +if (results.getWeb() != null) { + for (Map result : results.getWeb()) { + System.out.println(result.get("title") + " — " + result.get("url")); + } +} +``` + +### Agent + +Run an AI-powered agent to research and extract data from the web. 
+ +```java +AgentStatusResponse result = client.agent( + AgentOptions.builder() + .prompt("Find the pricing plans for Firecrawl and compare them") + .build()); + +System.out.println(result.getData()); +``` + +### Usage & Metrics + +```java +ConcurrencyCheck conc = client.getConcurrency(); +System.out.println("Concurrency: " + conc.getConcurrency() + "/" + conc.getMaxConcurrency()); + +CreditUsage credits = client.getCreditUsage(); +System.out.println("Remaining credits: " + credits.getRemainingCredits()); +``` + +## Async Support + +All methods have async variants that return `CompletableFuture`: + +```java +import java.util.concurrent.CompletableFuture; + +CompletableFuture future = client.scrapeAsync( + "https://example.com", + ScrapeOptions.builder().formats(List.of("markdown")).build()); + +future.thenAccept(doc -> System.out.println(doc.getMarkdown())); +``` + +## Error Handling + +The SDK throws unchecked exceptions: + +```java +import com.firecrawl.errors.*; + +try { + Document doc = client.scrape("https://example.com"); +} catch (AuthenticationException e) { + // 401 — invalid API key + System.err.println("Auth failed: " + e.getMessage()); +} catch (RateLimitException e) { + // 429 — too many requests + System.err.println("Rate limited: " + e.getMessage()); +} catch (JobTimeoutException e) { + // Async job timed out + System.err.println("Job " + e.getJobId() + " timed out after " + e.getTimeoutSeconds() + "s"); +} catch (FirecrawlException e) { + // All other API errors + System.err.println("Error " + e.getStatusCode() + ": " + e.getMessage()); +} +``` + +## Configuration + +```java +FirecrawlClient client = FirecrawlClient.builder() + .apiKey("fc-your-api-key") // Required (or set FIRECRAWL_API_KEY env var) + .apiUrl("https://api.firecrawl.dev") // Optional (or set FIRECRAWL_API_URL env var) + .timeoutMs(300_000) // HTTP timeout: 5 min default + .maxRetries(3) // Auto-retries for transient failures + .backoffFactor(0.5) // Exponential backoff factor 
(seconds) + .asyncExecutor(myExecutor) // Custom executor for async methods + .build(); +``` + +## Building from Source + +### Clone and Build + +```bash +# Clone the repository (if you haven't already) +git clone https://github.com/mendableai/firecrawl.git +cd firecrawl/apps/java-sdk + +# Build the project +gradle build +``` + +### Generate JAR + +```bash +gradle jar +# Output: build/libs/firecrawl-java-1.0.0.jar +``` + +### Install Locally + +```bash +gradle publishToMavenLocal +# Now available as: com.firecrawl:firecrawl-java:1.0.0 in local Maven repository +``` + +## Running Tests + +The SDK includes both unit tests and E2E integration tests. + +### Unit Tests (No API Key Required) + +Unit tests verify SDK functionality without making actual API calls: + +```bash +gradle test +``` + +### E2E Integration Tests (API Key Required) + +E2E tests make real API calls and require a valid API key. These tests will be **skipped** if `FIRECRAWL_API_KEY` is not set: + +```bash +# Set your API key +export FIRECRAWL_API_KEY="fc-your-api-key-here" + +# Run all tests including E2E +gradle test +``` + +### Run Specific Tests + +```bash +# Run only scrape tests +gradle test --tests "*testScrape*" + +# Run only E2E tests +gradle test --tests "*E2E" + +# Run specific test class +gradle test --tests "com.firecrawl.FirecrawlClientTest" +``` + +### View Test Results + +After running tests, view the detailed report: + +```bash +open build/reports/tests/test/index.html # macOS +xdg-open build/reports/tests/test/index.html # Linux +``` + +## Development Setup + +If you're contributing to the SDK or testing local changes: + +1. **Install Prerequisites** (see Prerequisites section above) + +2. **Set Environment Variables:** + ```bash + export FIRECRAWL_API_KEY="fc-your-api-key" + # Optional: use local API server + export FIRECRAWL_API_URL="http://localhost:3002" + ``` + +3. **Build and Test:** + ```bash + gradle clean build test + ``` + +4. 
**Make Changes and Retest:** + ```bash + # Quick compilation check + gradle compileJava + + # Run tests + gradle test --tests "*testYourFeature*" + ``` diff --git a/apps/java-sdk/build.gradle.kts b/apps/java-sdk/build.gradle.kts new file mode 100644 index 0000000000..3ec9671476 --- /dev/null +++ b/apps/java-sdk/build.gradle.kts @@ -0,0 +1,67 @@ +plugins { + `java-library` + `maven-publish` +} + +group = "com.firecrawl" +version = "1.0.0" + +java { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 + withSourcesJar() + withJavadocJar() +} + +repositories { + mavenCentral() +} + +dependencies { + api("com.squareup.okhttp3:okhttp:4.12.0") + api("com.fasterxml.jackson.core:jackson-databind:2.17.2") + api("com.fasterxml.jackson.core:jackson-annotations:2.17.2") + api("com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.17.2") + + testImplementation("org.junit.jupiter:junit-jupiter:5.10.3") + testRuntimeOnly("org.junit.platform:junit-platform-launcher:1.10.3") +} + +tasks.test { + useJUnitPlatform() +} + +tasks.withType { + options { + (this as StandardJavadocDocletOptions).apply { + addStringOption("Xdoclint:none", "-quiet") + } + } +} + +publishing { + publications { + create("mavenJava") { + from(components["java"]) + + pom { + name.set("Firecrawl Java SDK") + description.set("Java SDK for the Firecrawl web scraping API") + url.set("https://github.com/mendableai/firecrawl") + + licenses { + license { + name.set("MIT License") + url.set("https://opensource.org/licenses/MIT") + } + } + + scm { + url.set("https://github.com/mendableai/firecrawl") + connection.set("scm:git:git://github.com/mendableai/firecrawl.git") + developerConnection.set("scm:git:ssh://github.com/mendableai/firecrawl.git") + } + } + } + } +} diff --git a/apps/java-sdk/gradle/wrapper/gradle-wrapper.jar b/apps/java-sdk/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000000..a4b76b9530 Binary files /dev/null and 
b/apps/java-sdk/gradle/wrapper/gradle-wrapper.jar differ diff --git a/apps/java-sdk/gradle/wrapper/gradle-wrapper.properties b/apps/java-sdk/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000000..9355b41557 --- /dev/null +++ b/apps/java-sdk/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,7 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip +networkTimeout=10000 +validateDistributionUrl=true +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/apps/java-sdk/gradlew b/apps/java-sdk/gradlew new file mode 100755 index 0000000000..4c8789e1c5 --- /dev/null +++ b/apps/java-sdk/gradlew @@ -0,0 +1,120 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. 
+# +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +app_path=$0 +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld -- "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NonStop* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + if ! command -v java >/dev/null 2>&1 ; then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! 
"$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + ;; + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + ;; + esac +fi + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$@" + +exec "$JAVACMD" "$@" diff --git a/apps/java-sdk/settings.gradle.kts b/apps/java-sdk/settings.gradle.kts new file mode 100644 index 0000000000..d9c10b0235 --- /dev/null +++ b/apps/java-sdk/settings.gradle.kts @@ -0,0 +1 @@ +rootProject.name = "firecrawl-java" diff --git a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java new file mode 100644 index 0000000000..efbb4724e9 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java @@ -0,0 +1,841 @@ +package com.firecrawl.client; + +import com.firecrawl.errors.FirecrawlException; +import com.firecrawl.errors.JobTimeoutException; +import com.firecrawl.models.*; + +import java.util.*; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; +import java.util.concurrent.ForkJoinPool; + +/** + * Client for the Firecrawl v2 API. + * + *

Example usage: + *

{@code
+ * FirecrawlClient client = FirecrawlClient.builder()
+ *     .apiKey("fc-your-api-key")
+ *     .build();
+ *
+ * // Scrape a single page
+ * Document doc = client.scrape("https://example.com",
+ *     ScrapeOptions.builder()
+ *         .formats(List.of("markdown"))
+ *         .build());
+ *
+ * // Crawl a website
+ * CrawlJob job = client.crawl("https://example.com",
+ *     CrawlOptions.builder()
+ *         .limit(50)
+ *         .build());
+ * }
+ */ +public class FirecrawlClient { + + private static final String DEFAULT_API_URL = "https://api.firecrawl.dev"; + private static final long DEFAULT_TIMEOUT_MS = 300_000; // 5 minutes + private static final int DEFAULT_MAX_RETRIES = 3; + private static final double DEFAULT_BACKOFF_FACTOR = 0.5; + private static final int DEFAULT_POLL_INTERVAL = 2; // seconds + private static final int DEFAULT_JOB_TIMEOUT = 300; // seconds + + private final FirecrawlHttpClient http; + private final Executor asyncExecutor; + + private FirecrawlClient(FirecrawlHttpClient http, Executor asyncExecutor) { + this.http = http; + this.asyncExecutor = asyncExecutor; + } + + /** + * Creates a new builder for constructing a FirecrawlClient. + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Creates a client from the FIRECRAWL_API_KEY environment variable. + */ + public static FirecrawlClient fromEnv() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey == null || apiKey.isBlank()) { + String sysProp = System.getProperty("firecrawl.apiKey"); + if (sysProp == null || sysProp.isBlank()) { + throw new FirecrawlException("FIRECRAWL_API_KEY environment variable or firecrawl.apiKey system property is required"); + } + apiKey = sysProp; + } + return builder().apiKey(apiKey).build(); + } + + // ================================================================ + // SCRAPE + // ================================================================ + + /** + * Scrapes a single URL and returns the document. + * + * @param url the URL to scrape + * @return the scraped document + */ + public Document scrape(String url) { + return scrape(url, null); + } + + /** + * Scrapes a single URL with options. 
+ * + * @param url the URL to scrape + * @param options scrape configuration options + * @return the scraped document + */ + public Document scrape(String url, ScrapeOptions options) { + Objects.requireNonNull(url, "URL is required"); + Map body = new LinkedHashMap<>(); + body.put("url", url); + if (options != null) { + mergeOptions(body, options); + } + return extractData(http.post("/v2/scrape", body, Map.class), Document.class); + } + + // ================================================================ + // CRAWL + // ================================================================ + + /** + * Starts an async crawl job and returns immediately. + * + * @param url the URL to start crawling from + * @param options crawl configuration options + * @return the crawl job reference with ID + */ + public CrawlResponse startCrawl(String url, CrawlOptions options) { + Objects.requireNonNull(url, "URL is required"); + Map body = new LinkedHashMap<>(); + body.put("url", url); + if (options != null) { + mergeOptions(body, options); + } + return http.post("/v2/crawl", body, CrawlResponse.class); + } + + /** + * Gets the status and results of a crawl job. + * + * @param jobId the crawl job ID + * @return the crawl job status + */ + public CrawlJob getCrawlStatus(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.get("/v2/crawl/" + jobId, CrawlJob.class); + } + + /** + * Crawls a website and waits for completion (auto-polling). + * + * @param url the URL to crawl + * @param options crawl configuration options + * @return the completed crawl job with all documents + */ + public CrawlJob crawl(String url, CrawlOptions options) { + return crawl(url, options, DEFAULT_POLL_INTERVAL, DEFAULT_JOB_TIMEOUT); + } + + /** + * Crawls a website and waits for completion with custom polling settings. 
+ * + * @param url the URL to crawl + * @param options crawl configuration options + * @param pollIntervalSec seconds between status checks + * @param timeoutSec maximum seconds to wait + * @return the completed crawl job with all documents + */ + public CrawlJob crawl(String url, CrawlOptions options, int pollIntervalSec, int timeoutSec) { + CrawlResponse start = startCrawl(url, options); + return pollCrawl(start.getId(), pollIntervalSec, timeoutSec); + } + + /** + * Cancels a running crawl job. + * + * @param jobId the crawl job ID + * @return the cancellation response + */ + @SuppressWarnings("unchecked") + public Map cancelCrawl(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.delete("/v2/crawl/" + jobId, Map.class); + } + + /** + * Gets errors from a crawl job. + * + * @param jobId the crawl job ID + * @return error details + */ + @SuppressWarnings("unchecked") + public Map getCrawlErrors(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.get("/v2/crawl/" + jobId + "/errors", Map.class); + } + + // ================================================================ + // BATCH SCRAPE + // ================================================================ + + /** + * Starts an async batch scrape job. + * + * @param urls the URLs to scrape + * @param options batch scrape configuration options + * @return the batch job reference with ID + */ + @SuppressWarnings("unchecked") + public BatchScrapeResponse startBatchScrape(List urls, BatchScrapeOptions options) { + Objects.requireNonNull(urls, "URLs list is required"); + Map body = new LinkedHashMap<>(); + body.put("urls", urls); + Map extraHeaders = Collections.emptyMap(); + if (options != null) { + // Extract idempotencyKey before serialization — it must be sent as an + // HTTP header (x-idempotency-key), not in the JSON body. 
+ String idempotencyKey = options.getIdempotencyKey(); + if (idempotencyKey != null && !idempotencyKey.isEmpty()) { + extraHeaders = Collections.singletonMap("x-idempotency-key", idempotencyKey); + } + + mergeOptions(body, options); + // The API expects scrape options flattened at the top level, not nested + // under an "options" key. Extract and flatten them, but preserve + // batch-level fields so they are not overwritten by scrape options. + Map nested = (Map) body.remove("options"); + if (nested != null) { + Map batchFields = new LinkedHashMap<>(body); + body.putAll(nested); + body.putAll(batchFields); + } + } + return http.post("/v2/batch/scrape", body, BatchScrapeResponse.class, extraHeaders); + } + + /** + * Gets the status and results of a batch scrape job. + * + * @param jobId the batch scrape job ID + * @return the batch scrape job status + */ + public BatchScrapeJob getBatchScrapeStatus(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.get("/v2/batch/scrape/" + jobId, BatchScrapeJob.class); + } + + /** + * Batch-scrapes URLs and waits for completion (auto-polling). + * + * @param urls the URLs to scrape + * @param options batch scrape configuration options + * @return the completed batch scrape job with all documents + */ + public BatchScrapeJob batchScrape(List urls, BatchScrapeOptions options) { + return batchScrape(urls, options, DEFAULT_POLL_INTERVAL, DEFAULT_JOB_TIMEOUT); + } + + /** + * Batch-scrapes URLs and waits for completion with custom polling settings. 
+ * + * @param urls the URLs to scrape + * @param options batch scrape configuration options + * @param pollIntervalSec seconds between status checks + * @param timeoutSec maximum seconds to wait + * @return the completed batch scrape job with all documents + */ + public BatchScrapeJob batchScrape(List urls, BatchScrapeOptions options, + int pollIntervalSec, int timeoutSec) { + BatchScrapeResponse start = startBatchScrape(urls, options); + return pollBatchScrape(start.getId(), pollIntervalSec, timeoutSec); + } + + /** + * Cancels a running batch scrape job. + * + * @param jobId the batch scrape job ID + * @return the cancellation response + */ + @SuppressWarnings("unchecked") + public Map cancelBatchScrape(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.delete("/v2/batch/scrape/" + jobId, Map.class); + } + + // ================================================================ + // MAP + // ================================================================ + + /** + * Discovers URLs on a website. + * + * @param url the URL to map + * @return the discovered URLs + */ + public MapData map(String url) { + return map(url, null); + } + + /** + * Discovers URLs on a website with options. + * + * @param url the URL to map + * @param options map configuration options + * @return the discovered URLs + */ + public MapData map(String url, MapOptions options) { + Objects.requireNonNull(url, "URL is required"); + Map body = new LinkedHashMap<>(); + body.put("url", url); + if (options != null) { + mergeOptions(body, options); + } + return extractData(http.post("/v2/map", body, Map.class), MapData.class); + } + + // ================================================================ + // SEARCH + // ================================================================ + + /** + * Performs a web search. 
+ * + * @param query the search query + * @return search results + */ + public SearchData search(String query) { + return search(query, null); + } + + /** + * Performs a web search with options. + * + * @param query the search query + * @param options search configuration options + * @return search results + */ + public SearchData search(String query, SearchOptions options) { + Objects.requireNonNull(query, "Query is required"); + Map body = new LinkedHashMap<>(); + body.put("query", query); + if (options != null) { + mergeOptions(body, options); + } + return extractData(http.post("/v2/search", body, Map.class), SearchData.class); + } + + // ================================================================ + // AGENT + // ================================================================ + + /** + * Starts an async agent task. + * + * @param options agent configuration options + * @return the agent response with job ID + */ + public AgentResponse startAgent(AgentOptions options) { + Objects.requireNonNull(options, "Agent options are required"); + return http.post("/v2/agent", options, AgentResponse.class); + } + + /** + * Gets the status of an agent task. + * + * @param jobId the agent job ID + * @return the agent status response + */ + public AgentStatusResponse getAgentStatus(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.get("/v2/agent/" + jobId, AgentStatusResponse.class); + } + + /** + * Runs an agent task and waits for completion (auto-polling). + * + * @param options agent configuration options + * @return the completed agent status response + */ + public AgentStatusResponse agent(AgentOptions options) { + return agent(options, DEFAULT_POLL_INTERVAL, DEFAULT_JOB_TIMEOUT); + } + + /** + * Runs an agent task and waits for completion with custom polling settings. 
+ * + * @param options agent configuration options + * @param pollIntervalSec seconds between status checks + * @param timeoutSec maximum seconds to wait + * @return the completed agent status response + */ + public AgentStatusResponse agent(AgentOptions options, int pollIntervalSec, int timeoutSec) { + AgentResponse start = startAgent(options); + if (start.getId() == null) { + throw new FirecrawlException("Agent start did not return a job ID"); + } + long deadline = System.currentTimeMillis() + (timeoutSec * 1000L); + while (System.currentTimeMillis() < deadline) { + AgentStatusResponse status = getAgentStatus(start.getId()); + if (status.isDone()) { + return status; + } + sleep(pollIntervalSec); + } + throw new JobTimeoutException(start.getId(), timeoutSec, "Agent"); + } + + /** + * Cancels a running agent task. + * + * @param jobId the agent job ID + * @return the cancellation response + */ + @SuppressWarnings("unchecked") + public Map cancelAgent(String jobId) { + Objects.requireNonNull(jobId, "Job ID is required"); + return http.delete("/v2/agent/" + jobId, Map.class); + } + + // ================================================================ + // BROWSER + // ================================================================ + + /** + * Creates a new browser session with default settings. + * + * @return the browser session details including id, CDP URL, and live view URL + */ + public BrowserCreateResponse browser() { + return browser(null, null, null); + } + + /** + * Creates a new browser session with options. 
+ * + * @param ttl total session lifetime in seconds (30-3600), or null for default + * @param activityTtl idle timeout in seconds (10-3600), or null for default + * @param streamWebView whether to enable live view streaming, or null for default + * @return the browser session details + */ + public BrowserCreateResponse browser(Integer ttl, Integer activityTtl, Boolean streamWebView) { + Map body = new LinkedHashMap<>(); + if (ttl != null) body.put("ttl", ttl); + if (activityTtl != null) body.put("activityTtl", activityTtl); + if (streamWebView != null) body.put("streamWebView", streamWebView); + return http.post("/v2/browser", body, BrowserCreateResponse.class); + } + + /** + * Executes code in a browser session using the default language (bash). + * + * @param sessionId the browser session ID + * @param code the code to execute + * @return the execution result including stdout, stderr, and exit code + */ + public BrowserExecuteResponse browserExecute(String sessionId, String code) { + return browserExecute(sessionId, code, "bash", null); + } + + /** + * Executes code in a browser session with options. + * + * @param sessionId the browser session ID + * @param code the code to execute + * @param language the language: "python", "node", or "bash" (default: "bash") + * @param timeout execution timeout in seconds (1-300), or null for default (30) + * @return the execution result including stdout, stderr, and exit code + */ + public BrowserExecuteResponse browserExecute(String sessionId, String code, + String language, Integer timeout) { + Objects.requireNonNull(sessionId, "Session ID is required"); + Objects.requireNonNull(code, "Code is required"); + Map body = new LinkedHashMap<>(); + body.put("code", code); + body.put("language", language != null ? language : "bash"); + if (timeout != null) body.put("timeout", timeout); + return http.post("/v2/browser/" + sessionId + "/execute", body, BrowserExecuteResponse.class); + } + + /** + * Deletes a browser session. 
+ * + * @param sessionId the browser session ID + * @return the deletion response with session duration and billing info + */ + public BrowserDeleteResponse deleteBrowser(String sessionId) { + Objects.requireNonNull(sessionId, "Session ID is required"); + return http.delete("/v2/browser/" + sessionId, BrowserDeleteResponse.class); + } + + /** + * Lists all browser sessions. + * + * @return the list of browser sessions + */ + public BrowserListResponse listBrowsers() { + return listBrowsers(null); + } + + /** + * Lists browser sessions with optional status filter. + * + * @param status optional filter: "active" or "destroyed", or null for all + * @return the list of browser sessions + */ + public BrowserListResponse listBrowsers(String status) { + String endpoint = "/v2/browser"; + if (status != null && !status.isEmpty()) { + endpoint += "?status=" + status; + } + return http.get(endpoint, BrowserListResponse.class); + } + + // ================================================================ + // USAGE & METRICS + // ================================================================ + + /** + * Gets current concurrency usage. + */ + public ConcurrencyCheck getConcurrency() { + return http.get("/v2/concurrency-check", ConcurrencyCheck.class); + } + + /** + * Gets current credit usage. + */ + public CreditUsage getCreditUsage() { + return http.get("/v2/team/credit-usage", CreditUsage.class); + } + + // ================================================================ + // ASYNC CONVENIENCE METHODS + // ================================================================ + + /** + * Asynchronously scrapes a URL. 
+ * + * @param url the URL to scrape + * @param options scrape configuration options + * @return a CompletableFuture that resolves to the scraped Document + */ + public CompletableFuture scrapeAsync(String url, ScrapeOptions options) { + return CompletableFuture.supplyAsync(() -> scrape(url, options), asyncExecutor); + } + + /** + * Asynchronously crawls a website and waits for completion. + * + * @param url the URL to crawl + * @param options crawl configuration options + * @return a CompletableFuture that resolves to the completed CrawlJob + */ + public CompletableFuture crawlAsync(String url, CrawlOptions options) { + return CompletableFuture.supplyAsync(() -> crawl(url, options), asyncExecutor); + } + + /** + * Asynchronously crawls with custom polling settings. + * + * @param url the URL to crawl + * @param options crawl configuration options + * @param pollIntervalSec seconds between status checks + * @param timeoutSec maximum seconds to wait + * @return a CompletableFuture that resolves to the completed CrawlJob + */ + public CompletableFuture crawlAsync(String url, CrawlOptions options, + int pollIntervalSec, int timeoutSec) { + return CompletableFuture.supplyAsync(() -> crawl(url, options, pollIntervalSec, timeoutSec), asyncExecutor); + } + + /** + * Asynchronously batch-scrapes URLs and waits for completion. + * + * @param urls the URLs to scrape + * @param options batch scrape configuration options + * @return a CompletableFuture that resolves to the completed BatchScrapeJob + */ + public CompletableFuture batchScrapeAsync(List urls, BatchScrapeOptions options) { + return CompletableFuture.supplyAsync(() -> batchScrape(urls, options), asyncExecutor); + } + + /** + * Asynchronously runs a search. 
+ * + * @param query the search query + * @param options search configuration options + * @return a CompletableFuture that resolves to the SearchData + */ + public CompletableFuture searchAsync(String query, SearchOptions options) { + return CompletableFuture.supplyAsync(() -> search(query, options), asyncExecutor); + } + + /** + * Asynchronously runs a map operation. + * + * @param url the URL to map + * @param options map configuration options + * @return a CompletableFuture that resolves to the MapData + */ + public CompletableFuture mapAsync(String url, MapOptions options) { + return CompletableFuture.supplyAsync(() -> map(url, options), asyncExecutor); + } + + /** + * Asynchronously runs an agent task and waits for completion. + * + * @param options agent configuration options + * @return a CompletableFuture that resolves to the AgentStatusResponse + */ + public CompletableFuture agentAsync(AgentOptions options) { + return CompletableFuture.supplyAsync(() -> agent(options), asyncExecutor); + } + + /** + * Asynchronously creates a new browser session. + * + * @param ttl total session lifetime in seconds, or null for default + * @param activityTtl idle timeout in seconds, or null for default + * @param streamWebView whether to enable live view streaming, or null for default + * @return a CompletableFuture that resolves to the BrowserCreateResponse + */ + public CompletableFuture browserAsync(Integer ttl, Integer activityTtl, + Boolean streamWebView) { + return CompletableFuture.supplyAsync(() -> browser(ttl, activityTtl, streamWebView), asyncExecutor); + } + + /** + * Asynchronously executes code in a browser session. 
+ * + * @param sessionId the browser session ID + * @param code the code to execute + * @param language the language: "python", "node", or "bash" + * @param timeout execution timeout in seconds, or null for default + * @return a CompletableFuture that resolves to the BrowserExecuteResponse + */ + public CompletableFuture browserExecuteAsync(String sessionId, String code, + String language, Integer timeout) { + return CompletableFuture.supplyAsync(() -> browserExecute(sessionId, code, language, timeout), asyncExecutor); + } + + /** + * Asynchronously deletes a browser session. + * + * @param sessionId the browser session ID + * @return a CompletableFuture that resolves to the BrowserDeleteResponse + */ + public CompletableFuture deleteBrowserAsync(String sessionId) { + return CompletableFuture.supplyAsync(() -> deleteBrowser(sessionId), asyncExecutor); + } + + /** + * Asynchronously lists browser sessions. + * + * @param status optional filter: "active" or "destroyed", or null for all + * @return a CompletableFuture that resolves to the BrowserListResponse + */ + public CompletableFuture listBrowsersAsync(String status) { + return CompletableFuture.supplyAsync(() -> listBrowsers(status), asyncExecutor); + } + + // ================================================================ + // INTERNAL POLLING HELPERS + // ================================================================ + + private CrawlJob pollCrawl(String jobId, int pollIntervalSec, int timeoutSec) { + long deadline = System.currentTimeMillis() + (timeoutSec * 1000L); + while (System.currentTimeMillis() < deadline) { + CrawlJob job = getCrawlStatus(jobId); + if (job.isDone()) { + return paginateCrawl(job); + } + sleep(pollIntervalSec); + } + throw new JobTimeoutException(jobId, timeoutSec, "Crawl"); + } + + private BatchScrapeJob pollBatchScrape(String jobId, int pollIntervalSec, int timeoutSec) { + long deadline = System.currentTimeMillis() + (timeoutSec * 1000L); + while (System.currentTimeMillis() < 
deadline) { + BatchScrapeJob job = getBatchScrapeStatus(jobId); + if (job.isDone()) { + return paginateBatchScrape(job); + } + sleep(pollIntervalSec); + } + throw new JobTimeoutException(jobId, timeoutSec, "Batch scrape"); + } + + /** + * Auto-paginates crawl results by following the "next" cursor. + */ + private CrawlJob paginateCrawl(CrawlJob job) { + if (job.getData() == null) { + job.setData(new ArrayList<>()); + } + CrawlJob current = job; + while (current.getNext() != null && !current.getNext().isEmpty()) { + CrawlJob nextPage = http.getAbsolute(current.getNext(), CrawlJob.class); + if (nextPage.getData() != null && !nextPage.getData().isEmpty()) { + job.getData().addAll(nextPage.getData()); + } + current = nextPage; + } + return job; + } + + /** + * Auto-paginates batch scrape results by following the "next" cursor. + */ + private BatchScrapeJob paginateBatchScrape(BatchScrapeJob job) { + if (job.getData() == null) { + job.setData(new ArrayList<>()); + } + BatchScrapeJob current = job; + while (current.getNext() != null && !current.getNext().isEmpty()) { + BatchScrapeJob nextPage = http.getAbsolute(current.getNext(), BatchScrapeJob.class); + if (nextPage.getData() != null && !nextPage.getData().isEmpty()) { + job.getData().addAll(nextPage.getData()); + } + current = nextPage; + } + return job; + } + + // ================================================================ + // INTERNAL UTILITIES + // ================================================================ + + /** + * Extracts the "data" field from a raw API response map and deserializes it. 
+ */ + @SuppressWarnings("unchecked") + private T extractData(Map rawResponse, Class type) { + Object data = rawResponse.get("data"); + if (data == null) { + // Some endpoints return the data at the top level + return http.objectMapper.convertValue(rawResponse, type); + } + return http.objectMapper.convertValue(data, type); + } + + /** + * Merges a typed options object into a request body map, using Jackson serialization. + */ + @SuppressWarnings("unchecked") + private void mergeOptions(Map body, Object options) { + Map optionsMap = http.objectMapper.convertValue(options, Map.class); + body.putAll(optionsMap); + } + + private void sleep(int seconds) { + try { + Thread.sleep(seconds * 1000L); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new FirecrawlException("Polling interrupted", e); + } + } + + // ================================================================ + // BUILDER + // ================================================================ + + public static final class Builder { + + private String apiKey; + private String apiUrl = DEFAULT_API_URL; + private long timeoutMs = DEFAULT_TIMEOUT_MS; + private int maxRetries = DEFAULT_MAX_RETRIES; + private double backoffFactor = DEFAULT_BACKOFF_FACTOR; + private Executor asyncExecutor; + + private Builder() {} + + /** + * Sets the API key. Falls back to FIRECRAWL_API_KEY env var or + * firecrawl.apiKey system property if not provided. + */ + public Builder apiKey(String apiKey) { + this.apiKey = apiKey; + return this; + } + + /** + * Sets the API base URL. Defaults to https://api.firecrawl.dev. + * Falls back to FIRECRAWL_API_URL env var if not provided. + */ + public Builder apiUrl(String apiUrl) { + this.apiUrl = apiUrl; + return this; + } + + /** + * Sets the HTTP request timeout in milliseconds. Default: 300000 (5 minutes). 
+ */ + public Builder timeoutMs(long timeoutMs) { + this.timeoutMs = timeoutMs; + return this; + } + + /** + * Sets the maximum number of automatic retries for transient failures. Default: 3. + */ + public Builder maxRetries(int maxRetries) { + this.maxRetries = maxRetries; + return this; + } + + /** + * Sets the exponential backoff factor in seconds. Default: 0.5. + */ + public Builder backoffFactor(double backoffFactor) { + this.backoffFactor = backoffFactor; + return this; + } + + /** + * Sets a custom executor for async operations. Default: ForkJoinPool.commonPool(). + */ + public Builder asyncExecutor(Executor asyncExecutor) { + this.asyncExecutor = asyncExecutor; + return this; + } + + public FirecrawlClient build() { + String resolvedKey = apiKey; + if (resolvedKey == null || resolvedKey.isBlank()) { + resolvedKey = System.getenv("FIRECRAWL_API_KEY"); + } + if (resolvedKey == null || resolvedKey.isBlank()) { + resolvedKey = System.getProperty("firecrawl.apiKey"); + } + if (resolvedKey == null || resolvedKey.isBlank()) { + throw new FirecrawlException( + "API key is required. Set it via builder.apiKey(), " + + "FIRECRAWL_API_KEY environment variable, or firecrawl.apiKey system property."); + } + + String resolvedUrl = apiUrl; + if (resolvedUrl == null || resolvedUrl.equals(DEFAULT_API_URL)) { + String envUrl = System.getenv("FIRECRAWL_API_URL"); + if (envUrl != null && !envUrl.isEmpty()) { + resolvedUrl = envUrl; + } + } + + Executor executor = asyncExecutor != null ? 
asyncExecutor : ForkJoinPool.commonPool(); + FirecrawlHttpClient http = new FirecrawlHttpClient( + resolvedKey, resolvedUrl, timeoutMs, maxRetries, backoffFactor); + return new FirecrawlClient(http, executor); + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java new file mode 100644 index 0000000000..7fbc37f0d5 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java @@ -0,0 +1,215 @@ +package com.firecrawl.client; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; +import com.firecrawl.errors.AuthenticationException; +import com.firecrawl.errors.FirecrawlException; +import com.firecrawl.errors.RateLimitException; +import okhttp3.*; + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * Internal HTTP client for making authenticated requests to the Firecrawl API. + * Handles retry logic with exponential backoff. + */ +class FirecrawlHttpClient { + + private static final MediaType JSON = MediaType.get("application/json; charset=utf-8"); + + private final OkHttpClient httpClient; + private final String apiKey; + private final String baseUrl; + private final int maxRetries; + private final double backoffFactor; + final ObjectMapper objectMapper; + + FirecrawlHttpClient(String apiKey, String baseUrl, long timeoutMs, int maxRetries, double backoffFactor) { + this.apiKey = apiKey; + this.baseUrl = baseUrl.endsWith("/") ? 
baseUrl.substring(0, baseUrl.length() - 1) : baseUrl; + this.maxRetries = maxRetries; + this.backoffFactor = backoffFactor; + + this.httpClient = new OkHttpClient.Builder() + .connectTimeout(timeoutMs, TimeUnit.MILLISECONDS) + .readTimeout(timeoutMs, TimeUnit.MILLISECONDS) + .writeTimeout(timeoutMs, TimeUnit.MILLISECONDS) + .build(); + + this.objectMapper = new ObjectMapper() + .registerModule(new Jdk8Module()) + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + } + + /** + * Sends a POST request with JSON body. + */ + T post(String path, Object body, Class responseType) { + return post(path, body, responseType, Collections.emptyMap()); + } + + /** + * Sends a POST request with JSON body and extra headers. + */ + T post(String path, Object body, Class responseType, Map extraHeaders) { + String url = baseUrl + path; + String json; + try { + json = objectMapper.writeValueAsString(body); + } catch (JsonProcessingException e) { + throw new FirecrawlException("Failed to serialize request body", e); + } + RequestBody requestBody = RequestBody.create(json, JSON); + Request.Builder builder = new Request.Builder() + .url(url) + .header("Authorization", "Bearer " + apiKey) + .header("Content-Type", "application/json") + .post(requestBody); + for (Map.Entry entry : extraHeaders.entrySet()) { + builder.header(entry.getKey(), entry.getValue()); + } + Request request = builder.build(); + return executeWithRetry(request, responseType); + } + + /** + * Sends a GET request. + */ + T get(String path, Class responseType) { + String url = baseUrl + path; + Request request = new Request.Builder() + .url(url) + .header("Authorization", "Bearer " + apiKey) + .get() + .build(); + return executeWithRetry(request, responseType); + } + + /** + * Sends a GET request with full URL (for following next-page cursors). 
+ */ + T getAbsolute(String absoluteUrl, Class responseType) { + Request request = new Request.Builder() + .url(absoluteUrl) + .header("Authorization", "Bearer " + apiKey) + .get() + .build(); + return executeWithRetry(request, responseType); + } + + /** + * Sends a DELETE request. + */ + T delete(String path, Class responseType) { + String url = baseUrl + path; + Request request = new Request.Builder() + .url(url) + .header("Authorization", "Bearer " + apiKey) + .delete() + .build(); + return executeWithRetry(request, responseType); + } + + /** + * Sends a raw GET request and returns the response body as a parsed Map. + */ + @SuppressWarnings("unchecked") + Map getRaw(String path) { + return get(path, Map.class); + } + + private T executeWithRetry(Request request, Class responseType) { + int attempt = 0; + while (true) { + try { + try (Response response = httpClient.newCall(request).execute()) { + ResponseBody responseBody = response.body(); + String bodyStr = responseBody != null ? responseBody.string() : ""; + + if (response.isSuccessful()) { + if (responseType == Void.class || responseType == void.class) { + return null; + } + return objectMapper.readValue(bodyStr, responseType); + } + + int code = response.code(); + + // Parse error details from response + String errorMessage = extractErrorMessage(bodyStr, code); + String errorCode = extractErrorCode(bodyStr); + + // Non-retryable client errors + if (code == 401) { + throw new AuthenticationException(errorMessage, errorCode, null); + } + if (code == 429) { + throw new RateLimitException(errorMessage, errorCode, null); + } + if (code >= 400 && code < 500 && code != 408 && code != 409) { + throw new FirecrawlException(errorMessage, code, errorCode, null); + } + + // Retryable errors: 408, 409, 502, 5xx + if (attempt < maxRetries) { + attempt++; + sleepWithBackoff(attempt); + continue; + } + + throw new FirecrawlException(errorMessage, code, errorCode, null); + } + } catch (FirecrawlException e) { + throw e; + } 
catch (IOException e) { + if (attempt < maxRetries) { + attempt++; + sleepWithBackoff(attempt); + continue; + } + throw new FirecrawlException("Request failed: " + e.getMessage(), e); + } + } + } + + @SuppressWarnings("unchecked") + private String extractErrorMessage(String body, int statusCode) { + try { + Map parsed = objectMapper.readValue(body, Map.class); + if (parsed.containsKey("error")) { + return String.valueOf(parsed.get("error")); + } + if (parsed.containsKey("message")) { + return String.valueOf(parsed.get("message")); + } + } catch (Exception ignored) { + } + return "HTTP " + statusCode + " error"; + } + + @SuppressWarnings("unchecked") + private String extractErrorCode(String body) { + try { + Map parsed = objectMapper.readValue(body, Map.class); + Object code = parsed.get("code"); + return code != null ? String.valueOf(code) : null; + } catch (Exception ignored) { + } + return null; + } + + private void sleepWithBackoff(int attempt) { + long delayMs = (long) (backoffFactor * 1000 * Math.pow(2, attempt - 1)); + try { + Thread.sleep(delayMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new FirecrawlException("Request interrupted during retry backoff", e); + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/errors/AuthenticationException.java b/apps/java-sdk/src/main/java/com/firecrawl/errors/AuthenticationException.java new file mode 100644 index 0000000000..c88cfe642d --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/errors/AuthenticationException.java @@ -0,0 +1,15 @@ +package com.firecrawl.errors; + +/** + * Thrown when the API returns a 401 Unauthorized response. 
+ */ +public class AuthenticationException extends FirecrawlException { + + public AuthenticationException(String message) { + super(message, 401); + } + + public AuthenticationException(String message, String errorCode, Object details) { + super(message, 401, errorCode, details); + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/errors/FirecrawlException.java b/apps/java-sdk/src/main/java/com/firecrawl/errors/FirecrawlException.java new file mode 100644 index 0000000000..9c14d4051f --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/errors/FirecrawlException.java @@ -0,0 +1,42 @@ +package com.firecrawl.errors; + +/** + * Base exception for all Firecrawl SDK errors. + */ +public class FirecrawlException extends RuntimeException { + + private final int statusCode; + private final String errorCode; + private final Object details; + + public FirecrawlException(String message) { + this(message, 0, null, null); + } + + public FirecrawlException(String message, int statusCode) { + this(message, statusCode, null, null); + } + + public FirecrawlException(String message, int statusCode, String errorCode, Object details) { + super(message); + this.statusCode = statusCode; + this.errorCode = errorCode; + this.details = details; + } + + public FirecrawlException(String message, Throwable cause) { + super(message, cause); + this.statusCode = 0; + this.errorCode = null; + this.details = null; + } + + /** HTTP status code (0 if not an HTTP error). */ + public int getStatusCode() { return statusCode; } + + /** Error code from the API response, if any. */ + public String getErrorCode() { return errorCode; } + + /** Additional error details from the API response, if any. 
*/ + public Object getDetails() { return details; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/errors/JobTimeoutException.java b/apps/java-sdk/src/main/java/com/firecrawl/errors/JobTimeoutException.java new file mode 100644 index 0000000000..1e2f5955f6 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/errors/JobTimeoutException.java @@ -0,0 +1,22 @@ +package com.firecrawl.errors; + +/** + * Thrown when an async job (crawl, batch, agent) does not complete within the specified timeout. + */ +public class JobTimeoutException extends FirecrawlException { + + private final String jobId; + private final int timeoutSeconds; + + public JobTimeoutException(String jobId, int timeoutSeconds, String jobType) { + super(jobType + " job " + jobId + " did not complete within " + timeoutSeconds + " seconds"); + this.jobId = jobId; + this.timeoutSeconds = timeoutSeconds; + } + + /** The ID of the timed-out job. */ + public String getJobId() { return jobId; } + + /** The timeout in seconds that was exceeded. */ + public int getTimeoutSeconds() { return timeoutSeconds; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/errors/RateLimitException.java b/apps/java-sdk/src/main/java/com/firecrawl/errors/RateLimitException.java new file mode 100644 index 0000000000..9a7271efe9 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/errors/RateLimitException.java @@ -0,0 +1,15 @@ +package com.firecrawl.errors; + +/** + * Thrown when the API returns a 429 Too Many Requests response. 
+ */ +public class RateLimitException extends FirecrawlException { + + public RateLimitException(String message) { + super(message, 429); + } + + public RateLimitException(String message, String errorCode, Object details) { + super(message, 429, errorCode, details); + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/AgentOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentOptions.java new file mode 100644 index 0000000000..e0e25fd62a --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentOptions.java @@ -0,0 +1,80 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.List; +import java.util.Map; + +/** + * Options for starting an agent task. + */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class AgentOptions { + + private List urls; + private String prompt; + private Map schema; + private String integration; + private Integer maxCredits; + private Boolean strictConstrainToURLs; + private String model; + private WebhookConfig webhook; + + private AgentOptions() {} + + public List getUrls() { return urls; } + public String getPrompt() { return prompt; } + public Map getSchema() { return schema; } + public String getIntegration() { return integration; } + public Integer getMaxCredits() { return maxCredits; } + public Boolean getStrictConstrainToURLs() { return strictConstrainToURLs; } + public String getModel() { return model; } + public WebhookConfig getWebhook() { return webhook; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private List urls; + private String prompt; + private Map schema; + private String integration; + private Integer maxCredits; + private Boolean strictConstrainToURLs; + private String model; + private WebhookConfig webhook; + + private Builder() {} + + /** Optional URLs to constrain the agent to. 
*/ + public Builder urls(List urls) { this.urls = urls; return this; } + /** Natural language prompt describing what data to find. */ + public Builder prompt(String prompt) { this.prompt = prompt; return this; } + /** JSON Schema for structured output. */ + public Builder schema(Map schema) { this.schema = schema; return this; } + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + /** Maximum credits to spend. */ + public Builder maxCredits(Integer maxCredits) { this.maxCredits = maxCredits; return this; } + /** Don't navigate outside provided URLs. */ + public Builder strictConstrainToURLs(Boolean strictConstrainToURLs) { this.strictConstrainToURLs = strictConstrainToURLs; return this; } + /** Agent model: "spark-1-pro" or "spark-1-mini". */ + public Builder model(String model) { this.model = model; return this; } + /** Webhook configuration. */ + public Builder webhook(WebhookConfig webhook) { this.webhook = webhook; return this; } + + public AgentOptions build() { + if (prompt == null || prompt.isEmpty()) { + throw new IllegalArgumentException("Agent prompt is required"); + } + AgentOptions o = new AgentOptions(); + o.urls = this.urls; + o.prompt = this.prompt; + o.schema = this.schema; + o.integration = this.integration; + o.maxCredits = this.maxCredits; + o.strictConstrainToURLs = this.strictConstrainToURLs; + o.model = this.model; + o.webhook = this.webhook; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/AgentResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentResponse.java new file mode 100644 index 0000000000..5f5581204e --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentResponse.java @@ -0,0 +1,23 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Response from starting an agent task. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class AgentResponse { + + private boolean success; + private String id; + private String error; + + public boolean isSuccess() { return success; } + public String getId() { return id; } + public String getError() { return error; } + + @Override + public String toString() { + return "AgentResponse{success=" + success + ", id=" + id + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/AgentStatusResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentStatusResponse.java new file mode 100644 index 0000000000..4a761013f5 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/AgentStatusResponse.java @@ -0,0 +1,35 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Status response for an agent task. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class AgentStatusResponse { + + private boolean success; + private String status; + private String error; + private Object data; + private String model; + private String expiresAt; + private Integer creditsUsed; + + public boolean isSuccess() { return success; } + public String getStatus() { return status; } + public String getError() { return error; } + public Object getData() { return data; } + public String getModel() { return model; } + public String getExpiresAt() { return expiresAt; } + public Integer getCreditsUsed() { return creditsUsed; } + + public boolean isDone() { + return "completed".equals(status) || "failed".equals(status) || "cancelled".equals(status); + } + + @Override + public String toString() { + return "AgentStatusResponse{status=" + status + ", model=" + model + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeJob.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeJob.java new file mode 100644 index 0000000000..d6555b03f1 --- /dev/null +++ 
b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeJob.java @@ -0,0 +1,39 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; + +/** + * Status and results of a batch scrape job. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BatchScrapeJob { + + private String id; + private String status; + private int completed; + private int total; + private Integer creditsUsed; + private String expiresAt; + private String next; + private List data; + + public String getId() { return id; } + public String getStatus() { return status; } + public int getCompleted() { return completed; } + public int getTotal() { return total; } + public Integer getCreditsUsed() { return creditsUsed; } + public String getExpiresAt() { return expiresAt; } + public String getNext() { return next; } + public List getData() { return data; } + public void setData(List data) { this.data = data; } + + public boolean isDone() { + return "completed".equals(status) || "failed".equals(status) || "cancelled".equals(status); + } + + @Override + public String toString() { + return "BatchScrapeJob{id=" + id + ", status=" + status + ", completed=" + completed + "/" + total + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeOptions.java new file mode 100644 index 0000000000..0119eebdc0 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeOptions.java @@ -0,0 +1,78 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; + +/** + * Options for a batch scrape job. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class BatchScrapeOptions { + + private ScrapeOptions options; + private Object webhook; + private String appendToId; + private Boolean ignoreInvalidURLs; + private Integer maxConcurrency; + private Boolean zeroDataRetention; + @JsonIgnore + private String idempotencyKey; + private String integration; + + private BatchScrapeOptions() {} + + public ScrapeOptions getOptions() { return options; } + public Object getWebhook() { return webhook; } + public String getAppendToId() { return appendToId; } + public Boolean getIgnoreInvalidURLs() { return ignoreInvalidURLs; } + public Integer getMaxConcurrency() { return maxConcurrency; } + public Boolean getZeroDataRetention() { return zeroDataRetention; } + @JsonIgnore + public String getIdempotencyKey() { return idempotencyKey; } + public String getIntegration() { return integration; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private ScrapeOptions options; + private Object webhook; + private String appendToId; + private Boolean ignoreInvalidURLs; + private Integer maxConcurrency; + private Boolean zeroDataRetention; + private String idempotencyKey; + private String integration; + + private Builder() {} + + /** Scrape options applied to each URL. */ + public Builder options(ScrapeOptions options) { this.options = options; return this; } + /** Webhook URL string or {@link WebhookConfig} object. */ + public Builder webhook(Object webhook) { this.webhook = webhook; return this; } + /** Append URLs to an existing batch job. */ + public Builder appendToId(String appendToId) { this.appendToId = appendToId; return this; } + /** Ignore invalid URLs instead of failing. */ + public Builder ignoreInvalidURLs(Boolean ignoreInvalidURLs) { this.ignoreInvalidURLs = ignoreInvalidURLs; return this; } + /** Max concurrent scrapes. 
*/ + public Builder maxConcurrency(Integer maxConcurrency) { this.maxConcurrency = maxConcurrency; return this; } + /** Do not store any data on Firecrawl servers. */ + public Builder zeroDataRetention(Boolean zeroDataRetention) { this.zeroDataRetention = zeroDataRetention; return this; } + /** Idempotency key to prevent duplicate batch jobs. */ + public Builder idempotencyKey(String idempotencyKey) { this.idempotencyKey = idempotencyKey; return this; } + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + + public BatchScrapeOptions build() { + BatchScrapeOptions o = new BatchScrapeOptions(); + o.options = this.options; + o.webhook = this.webhook; + o.appendToId = this.appendToId; + o.ignoreInvalidURLs = this.ignoreInvalidURLs; + o.maxConcurrency = this.maxConcurrency; + o.zeroDataRetention = this.zeroDataRetention; + o.idempotencyKey = this.idempotencyKey; + o.integration = this.integration; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeResponse.java new file mode 100644 index 0000000000..1c1084ddc0 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BatchScrapeResponse.java @@ -0,0 +1,24 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; + +/** + * Response from starting an async batch scrape job. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BatchScrapeResponse { + + private String id; + private String url; + private List invalidURLs; + + public String getId() { return id; } + public String getUrl() { return url; } + public List getInvalidURLs() { return invalidURLs; } + + @Override + public String toString() { + return "BatchScrapeResponse{id=" + id + ", url=" + url + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserCreateResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserCreateResponse.java new file mode 100644 index 0000000000..99dff26141 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserCreateResponse.java @@ -0,0 +1,29 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Response from creating a new browser session. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BrowserCreateResponse { + + private boolean success; + private String id; + private String cdpUrl; + private String liveViewUrl; + private String expiresAt; + private String error; + + public boolean isSuccess() { return success; } + public String getId() { return id; } + public String getCdpUrl() { return cdpUrl; } + public String getLiveViewUrl() { return liveViewUrl; } + public String getExpiresAt() { return expiresAt; } + public String getError() { return error; } + + @Override + public String toString() { + return "BrowserCreateResponse{id=" + id + ", success=" + success + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserDeleteResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserDeleteResponse.java new file mode 100644 index 0000000000..1be1c458ac --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserDeleteResponse.java @@ -0,0 +1,25 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Response 
from deleting a browser session. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BrowserDeleteResponse { + + private boolean success; + private Long sessionDurationMs; + private Integer creditsBilled; + private String error; + + public boolean isSuccess() { return success; } + public Long getSessionDurationMs() { return sessionDurationMs; } + public Integer getCreditsBilled() { return creditsBilled; } + public String getError() { return error; } + + @Override + public String toString() { + return "BrowserDeleteResponse{success=" + success + ", creditsBilled=" + creditsBilled + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserExecuteResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserExecuteResponse.java new file mode 100644 index 0000000000..cab26e0bce --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserExecuteResponse.java @@ -0,0 +1,31 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Response from executing code in a browser session. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BrowserExecuteResponse { + + private boolean success; + private String stdout; + private String result; + private String stderr; + private Integer exitCode; + private Boolean killed; + private String error; + + public boolean isSuccess() { return success; } + public String getStdout() { return stdout; } + public String getResult() { return result; } + public String getStderr() { return stderr; } + public Integer getExitCode() { return exitCode; } + public Boolean getKilled() { return killed; } + public String getError() { return error; } + + @Override + public String toString() { + return "BrowserExecuteResponse{success=" + success + ", exitCode=" + exitCode + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserListResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserListResponse.java new file mode 100644 index 0000000000..9aba6a2e5b --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserListResponse.java @@ -0,0 +1,25 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; + +/** + * Response from listing browser sessions. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BrowserListResponse { + + private boolean success; + private List sessions; + private String error; + + public boolean isSuccess() { return success; } + public List getSessions() { return sessions; } + public String getError() { return error; } + + @Override + public String toString() { + int count = sessions != null ? 
sessions.size() : 0; + return "BrowserListResponse{success=" + success + ", sessions=" + count + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserSession.java b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserSession.java new file mode 100644 index 0000000000..2f4b4e5a4d --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/BrowserSession.java @@ -0,0 +1,31 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Represents a browser session's metadata. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class BrowserSession { + + private String id; + private String status; + private String cdpUrl; + private String liveViewUrl; + private boolean streamWebView; + private String createdAt; + private String lastActivity; + + public String getId() { return id; } + public String getStatus() { return status; } + public String getCdpUrl() { return cdpUrl; } + public String getLiveViewUrl() { return liveViewUrl; } + public boolean isStreamWebView() { return streamWebView; } + public String getCreatedAt() { return createdAt; } + public String getLastActivity() { return lastActivity; } + + @Override + public String toString() { + return "BrowserSession{id=" + id + ", status=" + status + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/ConcurrencyCheck.java b/apps/java-sdk/src/main/java/com/firecrawl/models/ConcurrencyCheck.java new file mode 100644 index 0000000000..55fc65935e --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/ConcurrencyCheck.java @@ -0,0 +1,21 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Current concurrency usage. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class ConcurrencyCheck { + + private int concurrency; + private int maxConcurrency; + + public int getConcurrency() { return concurrency; } + public int getMaxConcurrency() { return maxConcurrency; } + + @Override + public String toString() { + return "ConcurrencyCheck{concurrency=" + concurrency + "/" + maxConcurrency + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlJob.java b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlJob.java new file mode 100644 index 0000000000..3efbec882e --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlJob.java @@ -0,0 +1,40 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; + +/** + * Status and results of a crawl job. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class CrawlJob { + + private String id; + private String status; + private int total; + private int completed; + private Integer creditsUsed; + private String expiresAt; + private String next; + private List data; + + public String getId() { return id; } + public String getStatus() { return status; } + public int getTotal() { return total; } + public int getCompleted() { return completed; } + public Integer getCreditsUsed() { return creditsUsed; } + public String getExpiresAt() { return expiresAt; } + public String getNext() { return next; } + public List getData() { return data; } + public void setData(List data) { this.data = data; } + + /** Returns true if the job has finished (completed, failed, or cancelled). 
*/ + public boolean isDone() { + return "completed".equals(status) || "failed".equals(status) || "cancelled".equals(status); + } + + @Override + public String toString() { + return "CrawlJob{id=" + id + ", status=" + status + ", completed=" + completed + "/" + total + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlOptions.java new file mode 100644 index 0000000000..68bcb6ca75 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlOptions.java @@ -0,0 +1,154 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.List; +import java.util.Map; + +/** + * Options for crawling a website. + */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class CrawlOptions { + + private String prompt; + private List excludePaths; + private List includePaths; + private Integer maxDiscoveryDepth; + private String sitemap; + private Boolean ignoreQueryParameters; + private Boolean deduplicateSimilarURLs; + private Integer limit; + private Boolean crawlEntireDomain; + private Boolean allowExternalLinks; + private Boolean allowSubdomains; + private Integer delay; + private Integer maxConcurrency; + private Object webhook; + private ScrapeOptions scrapeOptions; + private Boolean regexOnFullURL; + private Boolean zeroDataRetention; + private String integration; + + private CrawlOptions() {} + + public String getPrompt() { return prompt; } + public List getExcludePaths() { return excludePaths; } + public List getIncludePaths() { return includePaths; } + public Integer getMaxDiscoveryDepth() { return maxDiscoveryDepth; } + public String getSitemap() { return sitemap; } + public Boolean getIgnoreQueryParameters() { return ignoreQueryParameters; } + public Boolean getDeduplicateSimilarURLs() { return deduplicateSimilarURLs; } + public Integer getLimit() { return limit; } + public Boolean getCrawlEntireDomain() { return 
crawlEntireDomain; } + public Boolean getAllowExternalLinks() { return allowExternalLinks; } + public Boolean getAllowSubdomains() { return allowSubdomains; } + public Integer getDelay() { return delay; } + public Integer getMaxConcurrency() { return maxConcurrency; } + public Object getWebhook() { return webhook; } + public ScrapeOptions getScrapeOptions() { return scrapeOptions; } + public Boolean getRegexOnFullURL() { return regexOnFullURL; } + public Boolean getZeroDataRetention() { return zeroDataRetention; } + public String getIntegration() { return integration; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String prompt; + private List excludePaths; + private List includePaths; + private Integer maxDiscoveryDepth; + private String sitemap; + private Boolean ignoreQueryParameters; + private Boolean deduplicateSimilarURLs; + private Integer limit; + private Boolean crawlEntireDomain; + private Boolean allowExternalLinks; + private Boolean allowSubdomains; + private Integer delay; + private Integer maxConcurrency; + private Object webhook; + private ScrapeOptions scrapeOptions; + private Boolean regexOnFullURL; + private Boolean zeroDataRetention; + private String integration; + + private Builder() {} + + /** Natural language prompt to guide crawling. */ + public Builder prompt(String prompt) { this.prompt = prompt; return this; } + + /** URL path patterns to exclude from crawling. */ + public Builder excludePaths(List excludePaths) { this.excludePaths = excludePaths; return this; } + + /** URL path patterns to include in crawling. */ + public Builder includePaths(List includePaths) { this.includePaths = includePaths; return this; } + + /** Maximum depth to discover links. */ + public Builder maxDiscoveryDepth(Integer maxDiscoveryDepth) { this.maxDiscoveryDepth = maxDiscoveryDepth; return this; } + + /** Sitemap handling: "skip", "include", or "only". 
*/ + public Builder sitemap(String sitemap) { this.sitemap = sitemap; return this; } + + /** Ignore query parameters when deduplicating URLs. */ + public Builder ignoreQueryParameters(Boolean ignoreQueryParameters) { this.ignoreQueryParameters = ignoreQueryParameters; return this; } + + /** Deduplicate URLs that are similar. */ + public Builder deduplicateSimilarURLs(Boolean deduplicateSimilarURLs) { this.deduplicateSimilarURLs = deduplicateSimilarURLs; return this; } + + /** Maximum number of pages to crawl. */ + public Builder limit(Integer limit) { this.limit = limit; return this; } + + /** Whether to crawl the entire domain. */ + public Builder crawlEntireDomain(Boolean crawlEntireDomain) { this.crawlEntireDomain = crawlEntireDomain; return this; } + + /** Follow external links. */ + public Builder allowExternalLinks(Boolean allowExternalLinks) { this.allowExternalLinks = allowExternalLinks; return this; } + + /** Follow subdomains. */ + public Builder allowSubdomains(Boolean allowSubdomains) { this.allowSubdomains = allowSubdomains; return this; } + + /** Delay in milliseconds between requests. */ + public Builder delay(Integer delay) { this.delay = delay; return this; } + + /** Maximum concurrent requests. */ + public Builder maxConcurrency(Integer maxConcurrency) { this.maxConcurrency = maxConcurrency; return this; } + + /** Webhook URL string or {@link WebhookConfig} object. */ + public Builder webhook(Object webhook) { this.webhook = webhook; return this; } + + /** Scrape options applied to each crawled page. */ + public Builder scrapeOptions(ScrapeOptions scrapeOptions) { this.scrapeOptions = scrapeOptions; return this; } + + /** Apply regex patterns to the full URL, not just the path. */ + public Builder regexOnFullURL(Boolean regexOnFullURL) { this.regexOnFullURL = regexOnFullURL; return this; } + + /** Do not store any scraped data on Firecrawl servers. 
*/ + public Builder zeroDataRetention(Boolean zeroDataRetention) { this.zeroDataRetention = zeroDataRetention; return this; } + + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + + public CrawlOptions build() { + CrawlOptions o = new CrawlOptions(); + o.prompt = this.prompt; + o.excludePaths = this.excludePaths; + o.includePaths = this.includePaths; + o.maxDiscoveryDepth = this.maxDiscoveryDepth; + o.sitemap = this.sitemap; + o.ignoreQueryParameters = this.ignoreQueryParameters; + o.deduplicateSimilarURLs = this.deduplicateSimilarURLs; + o.limit = this.limit; + o.crawlEntireDomain = this.crawlEntireDomain; + o.allowExternalLinks = this.allowExternalLinks; + o.allowSubdomains = this.allowSubdomains; + o.delay = this.delay; + o.maxConcurrency = this.maxConcurrency; + o.webhook = this.webhook; + o.scrapeOptions = this.scrapeOptions; + o.regexOnFullURL = this.regexOnFullURL; + o.zeroDataRetention = this.zeroDataRetention; + o.integration = this.integration; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlResponse.java b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlResponse.java new file mode 100644 index 0000000000..4ea20c732d --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/CrawlResponse.java @@ -0,0 +1,21 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Response from starting an async crawl job. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class CrawlResponse { + + private String id; + private String url; + + public String getId() { return id; } + public String getUrl() { return url; } + + @Override + public String toString() { + return "CrawlResponse{id=" + id + ", url=" + url + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/CreditUsage.java b/apps/java-sdk/src/main/java/com/firecrawl/models/CreditUsage.java new file mode 100644 index 0000000000..554c15ab39 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/CreditUsage.java @@ -0,0 +1,25 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +/** + * Current credit usage information. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class CreditUsage { + + private int remainingCredits; + private Integer planCredits; + private String billingPeriodStart; + private String billingPeriodEnd; + + public int getRemainingCredits() { return remainingCredits; } + public Integer getPlanCredits() { return planCredits; } + public String getBillingPeriodStart() { return billingPeriodStart; } + public String getBillingPeriodEnd() { return billingPeriodEnd; } + + @Override + public String toString() { + return "CreditUsage{remaining=" + remainingCredits + ", plan=" + planCredits + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java b/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java new file mode 100644 index 0000000000..beb41fb0fd --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java @@ -0,0 +1,49 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; +import java.util.Map; + +/** + * A scraped document returned by scrape, crawl, and batch endpoints. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class Document { + + private String markdown; + private String html; + private String rawHtml; + private Object json; + private String summary; + private Map metadata; + private List links; + private List images; + private String screenshot; + private List> attributes; + private Map actions; + private String warning; + private Map changeTracking; + private Map branding; + + public String getMarkdown() { return markdown; } + public String getHtml() { return html; } + public String getRawHtml() { return rawHtml; } + public Object getJson() { return json; } + public String getSummary() { return summary; } + public Map getMetadata() { return metadata; } + public List getLinks() { return links; } + public List getImages() { return images; } + public String getScreenshot() { return screenshot; } + public List> getAttributes() { return attributes; } + public Map getActions() { return actions; } + public String getWarning() { return warning; } + public Map getChangeTracking() { return changeTracking; } + public Map getBranding() { return branding; } + + @Override + public String toString() { + String title = metadata != null ? String.valueOf(metadata.get("title")) : "untitled"; + String url = metadata != null ? String.valueOf(metadata.get("sourceURL")) : "unknown"; + return "Document{title=" + title + ", url=" + url + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/JsonFormat.java b/apps/java-sdk/src/main/java/com/firecrawl/models/JsonFormat.java new file mode 100644 index 0000000000..1d53485ebf --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/JsonFormat.java @@ -0,0 +1,57 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.Map; + +/** + * JSON extraction format with optional schema and prompt. + * + *

Usage: + *

{@code
+ * JsonFormat jsonFmt = JsonFormat.builder()
+ *     .prompt("Extract the product name and price")
+ *     .schema(Map.of(
+ *         "type", "object",
+ *         "properties", Map.of(
+ *             "name", Map.of("type", "string"),
+ *             "price", Map.of("type", "number")
+ *         )
+ *     ))
+ *     .build();
+ * }
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class JsonFormat { + + private final String type = "json"; + private String prompt; + private Map schema; + + private JsonFormat() {} + + public String getType() { return type; } + public String getPrompt() { return prompt; } + public Map getSchema() { return schema; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String prompt; + private Map schema; + + private Builder() {} + + /** LLM prompt for extraction. */ + public Builder prompt(String prompt) { this.prompt = prompt; return this; } + + /** JSON Schema for structured extraction. */ + public Builder schema(Map schema) { this.schema = schema; return this; } + + public JsonFormat build() { + JsonFormat f = new JsonFormat(); + f.prompt = this.prompt; + f.schema = this.schema; + return f; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/LocationConfig.java b/apps/java-sdk/src/main/java/com/firecrawl/models/LocationConfig.java new file mode 100644 index 0000000000..6e0705dd4f --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/LocationConfig.java @@ -0,0 +1,38 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.List; + +/** + * Geolocation configuration for requests. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class LocationConfig { + + private String country; + private List languages; + + private LocationConfig() {} + + public String getCountry() { return country; } + public List getLanguages() { return languages; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String country; + private List languages; + + private Builder() {} + + public Builder country(String country) { this.country = country; return this; } + public Builder languages(List languages) { this.languages = languages; return this; } + + public LocationConfig build() { + LocationConfig c = new LocationConfig(); + c.country = this.country; + c.languages = this.languages; + return c; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MapData.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MapData.java new file mode 100644 index 0000000000..b9908352a6 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/MapData.java @@ -0,0 +1,53 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * Result of a map operation containing discovered URLs. + * + *

The v2 API may return {@code links} as either plain URL strings or + * objects with {@code url}, {@code title}, and {@code description} fields. + * This class normalises both representations into a uniform + * {@code List>} where each entry always contains at + * least a {@code "url"} key. + */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class MapData { + + private List links; + + /** + * Returns the discovered links, normalised so that every entry is a + * {@code Map} containing at least a {@code "url"} key. + * Plain-string entries returned by the API are wrapped as + * {@code {"url": ""}}. + */ + @SuppressWarnings("unchecked") + public List> getLinks() { + if (links == null) { + return null; + } + List> result = new ArrayList<>(links.size()); + for (Object item : links) { + if (item instanceof Map) { + result.add((Map) item); + } else if (item instanceof String) { + Map wrapped = new LinkedHashMap<>(); + wrapped.put("url", item); + result.add(wrapped); + } + } + return Collections.unmodifiableList(result); + } + + @Override + public String toString() { + int count = links != null ? links.size() : 0; + return "MapData{links=" + count + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MapOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MapOptions.java new file mode 100644 index 0000000000..06940939dd --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/MapOptions.java @@ -0,0 +1,76 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; + +/** + * Options for mapping (discovering URLs on) a website. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class MapOptions { + + private String search; + private String sitemap; + private Boolean includeSubdomains; + private Boolean ignoreQueryParameters; + private Integer limit; + private Integer timeout; + private String integration; + private LocationConfig location; + + private MapOptions() {} + + public String getSearch() { return search; } + /** Sitemap mode: "only", "include", or "skip". */ + public String getSitemap() { return sitemap; } + public Boolean getIncludeSubdomains() { return includeSubdomains; } + public Boolean getIgnoreQueryParameters() { return ignoreQueryParameters; } + public Integer getLimit() { return limit; } + public Integer getTimeout() { return timeout; } + public String getIntegration() { return integration; } + public LocationConfig getLocation() { return location; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String search; + private String sitemap; + private Boolean includeSubdomains; + private Boolean ignoreQueryParameters; + private Integer limit; + private Integer timeout; + private String integration; + private LocationConfig location; + + private Builder() {} + + /** Filter discovered URLs by keyword. */ + public Builder search(String search) { this.search = search; return this; } + /** Sitemap mode: "only", "include", or "skip". */ + public Builder sitemap(String sitemap) { this.sitemap = sitemap; return this; } + /** Include subdomains. */ + public Builder includeSubdomains(Boolean includeSubdomains) { this.includeSubdomains = includeSubdomains; return this; } + /** Ignore query parameters when deduplicating URLs. */ + public Builder ignoreQueryParameters(Boolean ignoreQueryParameters) { this.ignoreQueryParameters = ignoreQueryParameters; return this; } + /** Maximum number of URLs to return. */ + public Builder limit(Integer limit) { this.limit = limit; return this; } + /** Timeout in milliseconds. 
*/ + public Builder timeout(Integer timeout) { this.timeout = timeout; return this; } + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + /** Geolocation configuration. */ + public Builder location(LocationConfig location) { this.location = location; return this; } + + public MapOptions build() { + MapOptions o = new MapOptions(); + o.search = this.search; + o.sitemap = this.sitemap; + o.includeSubdomains = this.includeSubdomains; + o.ignoreQueryParameters = this.ignoreQueryParameters; + o.limit = this.limit; + o.timeout = this.timeout; + o.integration = this.integration; + o.location = this.location; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java new file mode 100644 index 0000000000..b7ab815d24 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java @@ -0,0 +1,186 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Options for scraping a single URL. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class ScrapeOptions { + + private List formats; + private Map headers; + private List includeTags; + private List excludeTags; + private Boolean onlyMainContent; + private Integer timeout; + private Integer waitFor; + private Boolean mobile; + private List parsers; + private List> actions; + private LocationConfig location; + private Boolean skipTlsVerification; + private Boolean removeBase64Images; + private Boolean blockAds; + private String proxy; + @JsonProperty("maxAge") + private Long maxAge; + private Boolean storeInCache; + private String integration; + + private ScrapeOptions() {} + + public List getFormats() { return formats; } + public Map getHeaders() { return headers; } + public List getIncludeTags() { return includeTags; } + public List getExcludeTags() { return excludeTags; } + public Boolean getOnlyMainContent() { return onlyMainContent; } + public Integer getTimeout() { return timeout; } + public Integer getWaitFor() { return waitFor; } + public Boolean getMobile() { return mobile; } + public List getParsers() { return parsers; } + public List> getActions() { return actions; } + public LocationConfig getLocation() { return location; } + public Boolean getSkipTlsVerification() { return skipTlsVerification; } + public Boolean getRemoveBase64Images() { return removeBase64Images; } + public Boolean getBlockAds() { return blockAds; } + public String getProxy() { return proxy; } + public Long getMaxAge() { return maxAge; } + public Boolean getStoreInCache() { return storeInCache; } + public String getIntegration() { return integration; } + + public static Builder builder() { return new Builder(); } + + public Builder toBuilder() { + Builder b = new Builder(); + b.formats = this.formats != null ? new ArrayList<>(this.formats) : null; + b.headers = this.headers != null ? new HashMap<>(this.headers) : null; + b.includeTags = this.includeTags != null ? 
new ArrayList<>(this.includeTags) : null; + b.excludeTags = this.excludeTags != null ? new ArrayList<>(this.excludeTags) : null; + b.onlyMainContent = this.onlyMainContent; + b.timeout = this.timeout; + b.waitFor = this.waitFor; + b.mobile = this.mobile; + b.parsers = this.parsers != null ? new ArrayList<>(this.parsers) : null; + b.actions = this.actions != null ? new ArrayList<>(this.actions) : null; + b.location = this.location; + b.skipTlsVerification = this.skipTlsVerification; + b.removeBase64Images = this.removeBase64Images; + b.blockAds = this.blockAds; + b.proxy = this.proxy; + b.maxAge = this.maxAge; + b.storeInCache = this.storeInCache; + b.integration = this.integration; + return b; + } + + public static final class Builder { + private List formats; + private Map headers; + private List includeTags; + private List excludeTags; + private Boolean onlyMainContent; + private Integer timeout; + private Integer waitFor; + private Boolean mobile; + private List parsers; + private List> actions; + private LocationConfig location; + private Boolean skipTlsVerification; + private Boolean removeBase64Images; + private Boolean blockAds; + private String proxy; + private Long maxAge; + private Boolean storeInCache; + private String integration; + + private Builder() {} + + /** + * Output formats to request. Accepts strings like "markdown", "html", "rawHtml", + * "links", "screenshot", "json", etc., or format configuration maps for advanced + * formats (e.g., JsonFormat, ScreenshotFormat). + */ + public Builder formats(List formats) { this.formats = formats; return this; } + + /** Custom HTTP headers to send with the request. */ + public Builder headers(Map headers) { this.headers = headers; return this; } + + /** Only include content from these HTML tags. */ + public Builder includeTags(List includeTags) { this.includeTags = includeTags; return this; } + + /** Exclude content from these HTML tags. 
*/ + public Builder excludeTags(List excludeTags) { this.excludeTags = excludeTags; return this; } + + /** Only return the main content of the page, excluding navbars/footers. */ + public Builder onlyMainContent(Boolean onlyMainContent) { this.onlyMainContent = onlyMainContent; return this; } + + /** Timeout in milliseconds for the scrape request. */ + public Builder timeout(Integer timeout) { this.timeout = timeout; return this; } + + /** Wait time in milliseconds before scraping (for JS rendering). */ + public Builder waitFor(Integer waitFor) { this.waitFor = waitFor; return this; } + + /** Scrape as a mobile device. */ + public Builder mobile(Boolean mobile) { this.mobile = mobile; return this; } + + /** Parsers to use (e.g., "pdf" or {"type": "pdf", "maxPages": 10}). */ + public Builder parsers(List parsers) { this.parsers = parsers; return this; } + + /** Actions to execute before/during scraping. */ + public Builder actions(List> actions) { this.actions = actions; return this; } + + /** Geolocation configuration. */ + public Builder location(LocationConfig location) { this.location = location; return this; } + + /** Skip TLS certificate verification. */ + public Builder skipTlsVerification(Boolean skipTlsVerification) { this.skipTlsVerification = skipTlsVerification; return this; } + + /** Remove base64-encoded images from the response. */ + public Builder removeBase64Images(Boolean removeBase64Images) { this.removeBase64Images = removeBase64Images; return this; } + + /** Block advertisements during scraping. */ + public Builder blockAds(Boolean blockAds) { this.blockAds = blockAds; return this; } + + /** Proxy mode: "basic", "stealth", "enhanced", "auto", or a custom proxy URL. */ + public Builder proxy(String proxy) { this.proxy = proxy; return this; } + + /** Use cached result if younger than this many milliseconds. */ + public Builder maxAge(Long maxAge) { this.maxAge = maxAge; return this; } + + /** Whether to cache the result. 
*/ + public Builder storeInCache(Boolean storeInCache) { this.storeInCache = storeInCache; return this; } + + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + + public ScrapeOptions build() { + ScrapeOptions o = new ScrapeOptions(); + o.formats = this.formats != null ? Collections.unmodifiableList(new ArrayList<>(this.formats)) : null; + o.headers = this.headers != null ? Collections.unmodifiableMap(new HashMap<>(this.headers)) : null; + o.includeTags = this.includeTags != null ? Collections.unmodifiableList(new ArrayList<>(this.includeTags)) : null; + o.excludeTags = this.excludeTags != null ? Collections.unmodifiableList(new ArrayList<>(this.excludeTags)) : null; + o.onlyMainContent = this.onlyMainContent; + o.timeout = this.timeout; + o.waitFor = this.waitFor; + o.mobile = this.mobile; + o.parsers = this.parsers != null ? Collections.unmodifiableList(new ArrayList<>(this.parsers)) : null; + o.actions = this.actions != null ? Collections.unmodifiableList(new ArrayList<>(this.actions)) : null; + o.location = this.location; + o.skipTlsVerification = this.skipTlsVerification; + o.removeBase64Images = this.removeBase64Images; + o.blockAds = this.blockAds; + o.proxy = this.proxy; + o.maxAge = this.maxAge; + o.storeInCache = this.storeInCache; + o.integration = this.integration; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/SearchData.java b/apps/java-sdk/src/main/java/com/firecrawl/models/SearchData.java new file mode 100644 index 0000000000..cc65cb4680 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/SearchData.java @@ -0,0 +1,37 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; +import java.util.Map; + +/** + * Search results from the v2 search API. + * The API returns an object with web, news, and images arrays. 
+ */ +@JsonIgnoreProperties(ignoreUnknown = true) +public class SearchData { + + private List> web; + private List> news; + private List> images; + + /** Web search results. */ + public List> getWeb() { return web; } + public void setWeb(List> web) { this.web = web; } + + /** News search results. */ + public List> getNews() { return news; } + public void setNews(List> news) { this.news = news; } + + /** Image search results. */ + public List> getImages() { return images; } + public void setImages(List> images) { this.images = images; } + + @Override + public String toString() { + int webCount = web != null ? web.size() : 0; + int newsCount = news != null ? news.size() : 0; + int imageCount = images != null ? images.size() : 0; + return "SearchData{web=" + webCount + ", news=" + newsCount + ", images=" + imageCount + "}"; + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/SearchOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/SearchOptions.java new file mode 100644 index 0000000000..c58447f532 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/SearchOptions.java @@ -0,0 +1,82 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.List; + +/** + * Options for a web search request. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class SearchOptions { + + private List sources; + private List categories; + private Integer limit; + private String tbs; + private String location; + private Boolean ignoreInvalidURLs; + private Integer timeout; + private ScrapeOptions scrapeOptions; + private String integration; + + private SearchOptions() {} + + public List getSources() { return sources; } + public List getCategories() { return categories; } + public Integer getLimit() { return limit; } + public String getTbs() { return tbs; } + public String getLocation() { return location; } + public Boolean getIgnoreInvalidURLs() { return ignoreInvalidURLs; } + public Integer getTimeout() { return timeout; } + public ScrapeOptions getScrapeOptions() { return scrapeOptions; } + public String getIntegration() { return integration; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private List sources; + private List categories; + private Integer limit; + private String tbs; + private String location; + private Boolean ignoreInvalidURLs; + private Integer timeout; + private ScrapeOptions scrapeOptions; + private String integration; + + private Builder() {} + + /** Source types: "web", "news", "images" as strings or {type: "web"} maps. */ + public Builder sources(List sources) { this.sources = sources; return this; } + /** Categories: "github", "research", "pdf". */ + public Builder categories(List categories) { this.categories = categories; return this; } + /** Maximum number of results. */ + public Builder limit(Integer limit) { this.limit = limit; return this; } + /** Time-based search filter (e.g., "qdr:d" for past day, "qdr:w" for past week). */ + public Builder tbs(String tbs) { this.tbs = tbs; return this; } + /** Location for search results (e.g., "US"). */ + public Builder location(String location) { this.location = location; return this; } + /** Ignore invalid URLs in results. 
*/ + public Builder ignoreInvalidURLs(Boolean ignoreInvalidURLs) { this.ignoreInvalidURLs = ignoreInvalidURLs; return this; } + /** Timeout in milliseconds. */ + public Builder timeout(Integer timeout) { this.timeout = timeout; return this; } + /** Scrape options applied to search result pages. */ + public Builder scrapeOptions(ScrapeOptions scrapeOptions) { this.scrapeOptions = scrapeOptions; return this; } + /** Integration identifier. */ + public Builder integration(String integration) { this.integration = integration; return this; } + + public SearchOptions build() { + SearchOptions o = new SearchOptions(); + o.sources = this.sources; + o.categories = this.categories; + o.limit = this.limit; + o.tbs = this.tbs; + o.location = this.location; + o.ignoreInvalidURLs = this.ignoreInvalidURLs; + o.timeout = this.timeout; + o.scrapeOptions = this.scrapeOptions; + o.integration = this.integration; + return o; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/WebhookConfig.java b/apps/java-sdk/src/main/java/com/firecrawl/models/WebhookConfig.java new file mode 100644 index 0000000000..96d0b3fd18 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/WebhookConfig.java @@ -0,0 +1,57 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import java.util.List; +import java.util.Map; + +/** + * Webhook configuration for async jobs. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class WebhookConfig { + + private String url; + private Map headers; + private Map metadata; + private List events; + + private WebhookConfig() {} + + public String getUrl() { return url; } + public Map getHeaders() { return headers; } + public Map getMetadata() { return metadata; } + public List getEvents() { return events; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String url; + private Map headers; + private Map metadata; + private List events; + + private Builder() {} + + public Builder url(String url) { this.url = url; return this; } + public Builder headers(Map headers) { this.headers = headers; return this; } + public Builder metadata(Map metadata) { this.metadata = metadata; return this; } + + /** + * Events to subscribe to. Crawl/batch events: "completed", "failed", "page", "started". + * Agent events: "started", "action", "completed", "failed", "cancelled". + */ + public Builder events(List events) { this.events = events; return this; } + + public WebhookConfig build() { + if (url == null || url.isEmpty()) { + throw new IllegalArgumentException("Webhook URL is required"); + } + WebhookConfig c = new WebhookConfig(); + c.url = this.url; + c.headers = this.headers; + c.metadata = this.metadata; + c.events = this.events; + return c; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/package-info.java b/apps/java-sdk/src/main/java/com/firecrawl/package-info.java new file mode 100644 index 0000000000..415cf33c47 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/package-info.java @@ -0,0 +1,23 @@ +/** + * Firecrawl Java SDK — a type-safe client for the Firecrawl v2 web scraping API. + * + *

Quick start: + *

{@code
+ * import com.firecrawl.client.FirecrawlClient;
+ * import com.firecrawl.models.*;
+ *
+ * FirecrawlClient client = FirecrawlClient.builder()
+ *     .apiKey("fc-your-api-key")
+ *     .build();
+ *
+ * Document doc = client.scrape("https://example.com",
+ *     ScrapeOptions.builder()
+ *         .formats(List.of("markdown"))
+ *         .build());
+ *
+ * System.out.println(doc.getMarkdown());
+ * }
+ * + * @see com.firecrawl.client.FirecrawlClient + */ +package com.firecrawl; diff --git a/apps/java-sdk/src/test/java/com/firecrawl/AgentTest.java b/apps/java-sdk/src/test/java/com/firecrawl/AgentTest.java new file mode 100644 index 0000000000..8cae94f131 --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/AgentTest.java @@ -0,0 +1,285 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.AgentOptions; +import com.firecrawl.models.AgentResponse; +import com.firecrawl.models.AgentStatusResponse; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Comprehensive Agent Tests + * + * Tests the AI agent functionality with various configurations. + * Based on Node.js SDK patterns and tested against live firecrawl.dev. + * + * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.AgentTest" + */ +class AgentTest { + + private static FirecrawlClient client; + + @BeforeAll + static void setup() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey != null && !apiKey.isBlank()) { + client = FirecrawlClient.fromEnv(); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentWithPrompt() { + System.out.println("\n=== Test: Agent with Prompt ==="); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .prompt("Find information about Firecrawl's main features and pricing") + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertNotNull(result.getStatus(), "Status should not be null"); + assertTrue(List.of("completed", "failed").contains(result.getStatus()), + "Status should be completed or failed: " + result.getStatus()); + + System.out.println("✓ Agent task completed"); + 
System.out.println(" Status: " + result.getStatus()); + if (result.getData() != null) { + System.out.println(" Data returned: ✓"); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentWithURLs() { + System.out.println("\n=== Test: Agent with Specific URLs ==="); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .urls(List.of("https://firecrawl.dev", "https://docs.firecrawl.dev")) + .prompt("What are the main features of Firecrawl?") + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertTrue(List.of("completed", "failed").contains(result.getStatus()), + "Status should be completed or failed"); + + System.out.println("✓ Agent with URLs completed"); + System.out.println(" URLs provided: 2"); + System.out.println(" Status: " + result.getStatus()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentWithSchema() { + System.out.println("\n=== Test: Agent with Schema ==="); + + Map schema = Map.of( + "type", "object", + "properties", Map.of( + "features", Map.of( + "type", "array", + "items", Map.of("type", "string") + ), + "pricing", Map.of( + "type", "object", + "properties", Map.of( + "plans", Map.of("type", "array") + ) + ) + ), + "required", List.of("features") + ); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .urls(List.of("https://firecrawl.dev")) + .prompt("Extract features and pricing information") + .schema(schema) + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertTrue(List.of("completed", "failed").contains(result.getStatus()), + "Status should be completed or failed"); + + System.out.println("✓ Agent with schema completed"); + System.out.println(" Schema provided: ✓"); + System.out.println(" Status: " + result.getStatus()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void 
testStartAgent() { + System.out.println("\n=== Test: Start Agent (Async) ==="); + + AgentResponse response = client.startAgent( + AgentOptions.builder() + .prompt("Research Firecrawl features") + .build()); + + assertNotNull(response, "Agent response should not be null"); + assertNotNull(response.getId(), "Agent ID should not be null"); + assertTrue(response.isSuccess(), "Response should be successful"); + + System.out.println("✓ Agent started successfully"); + System.out.println(" Job ID: " + response.getId()); + System.out.println(" Success: " + response.isSuccess()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentStatusCheck() { + System.out.println("\n=== Test: Check Agent Status ==="); + + // Start an agent + AgentResponse start = client.startAgent( + AgentOptions.builder() + .prompt("Find information about web scraping") + .build()); + + // Check status + AgentStatusResponse status = client.getAgentStatus(start.getId()); + + assertNotNull(status, "Status should not be null"); + assertNotNull(status.getStatus(), "Status field should not be null"); + assertTrue(List.of("scraping", "completed", "failed", "cancelled").contains(status.getStatus()), + "Status should be valid: " + status.getStatus()); + + System.out.println("✓ Agent status retrieved"); + System.out.println(" Status: " + status.getStatus()); + System.out.println(" Job ID: " + start.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCancelAgent() { + System.out.println("\n=== Test: Cancel Agent ==="); + + AgentResponse start = client.startAgent( + AgentOptions.builder() + .prompt("Long-running research task") + .build()); + + Map result = client.cancelAgent(start.getId()); + + assertNotNull(result, "Cancel result should not be null"); + + System.out.println("✓ Agent cancelled successfully"); + System.out.println(" Job ID: " + start.getId()); + } + + @Test + 
@EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentWithStrictURLConstraints() { + System.out.println("\n=== Test: Agent with Strict URL Constraints ==="); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .urls(List.of("https://docs.firecrawl.dev")) + .prompt("Extract API documentation structure") + .strictConstrainToURLs(true) + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertTrue(List.of("completed", "failed").contains(result.getStatus()), + "Status should be completed or failed"); + + System.out.println("✓ Agent with strict constraints completed"); + System.out.println(" Strict URL constraint: true"); + System.out.println(" Status: " + result.getStatus()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentWithMaxCredits() { + System.out.println("\n=== Test: Agent with Max Credits Limit ==="); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .prompt("Quick research on Firecrawl") + .maxCredits(10) + .build()); + + assertNotNull(result, "Agent result should not be null"); + + System.out.println("✓ Agent with credit limit completed"); + System.out.println(" Max credits: 10"); + System.out.println(" Status: " + result.getStatus()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentResearchTask() { + System.out.println("\n=== Test: Agent Research - Firecrawl Features ==="); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .urls(List.of("https://firecrawl.dev", "https://docs.firecrawl.dev")) + .prompt("Research and summarize the key features of Firecrawl, including scraping, crawling, and extraction capabilities") + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertEquals("completed", result.getStatus(), "Agent should complete successfully"); + 
assertNotNull(result.getData(), "Agent should return data"); + + System.out.println("✓ Research task completed"); + System.out.println(" Status: " + result.getStatus()); + System.out.println(" Data collected: ✓"); + + if (result.getData() != null) { + System.out.println(" Data summary: " + + result.getData().toString().substring(0, + Math.min(200, result.getData().toString().length())) + "..."); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testAgentComprehensive() { + System.out.println("\n=== Test: Agent with All Options ==="); + + Map schema = Map.of( + "type", "object", + "properties", Map.of( + "product_name", Map.of("type", "string"), + "features", Map.of( + "type", "array", + "items", Map.of("type", "string") + ), + "pricing", Map.of("type", "string") + ), + "required", List.of("product_name", "features") + ); + + AgentStatusResponse result = client.agent( + AgentOptions.builder() + .urls(List.of("https://firecrawl.dev")) + .prompt("Extract comprehensive product information including name, features, and pricing") + .schema(schema) + .maxCredits(20) + .strictConstrainToURLs(true) + .build()); + + assertNotNull(result, "Agent result should not be null"); + assertTrue(List.of("completed", "failed").contains(result.getStatus()), + "Status should be completed or failed"); + + System.out.println("✓ Comprehensive agent task completed"); + System.out.println(" Configuration:"); + System.out.println(" - URLs: 1"); + System.out.println(" - Schema: ✓"); + System.out.println(" - Max credits: 20"); + System.out.println(" - Strict constraints: true"); + System.out.println(" Results:"); + System.out.println(" - Status: " + result.getStatus()); + if (result.getData() != null) { + System.out.println(" - Data returned: ✓"); + } + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/BrowserTest.java b/apps/java-sdk/src/test/java/com/firecrawl/BrowserTest.java new file mode 100644 index 0000000000..9ea7b6aa6b 
--- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/BrowserTest.java @@ -0,0 +1,310 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.BrowserCreateResponse; +import com.firecrawl.models.BrowserDeleteResponse; +import com.firecrawl.models.BrowserExecuteResponse; +import com.firecrawl.models.BrowserListResponse; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Browser Sandbox Endpoint Tests + * + * Tests the browser session management functionality of the Firecrawl Java SDK. + * These tests require FIRECRAWL_API_KEY environment variable to be set. + * + * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.BrowserTest" + */ +class BrowserTest { + + private static FirecrawlClient client; + + @BeforeAll + static void setup() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey != null && !apiKey.isBlank()) { + client = FirecrawlClient.fromEnv(); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserCreateAndDelete() { + System.out.println("Testing browser session create and delete..."); + + // Create a browser session + BrowserCreateResponse createRes = client.browser(); + assertNotNull(createRes, "Create response should not be null"); + assertTrue(createRes.isSuccess(), "Create should succeed"); + assertNotNull(createRes.getId(), "Session ID should not be null"); + + String sessionId = createRes.getId(); + System.out.println(" Created session: " + sessionId); + + // Delete the browser session + BrowserDeleteResponse deleteRes = client.deleteBrowser(sessionId); + assertNotNull(deleteRes, "Delete response should not be null"); + assertTrue(deleteRes.isSuccess(), "Delete should succeed"); + + System.out.println("✓ Browser create and delete test 
passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserCreateWithOptions() { + System.out.println("Testing browser session create with options..."); + + // Create a session with custom TTL and activity TTL + BrowserCreateResponse createRes = client.browser(300, 120, true); + assertNotNull(createRes, "Create response should not be null"); + assertTrue(createRes.isSuccess(), "Create should succeed"); + assertNotNull(createRes.getId(), "Session ID should not be null"); + + String sessionId = createRes.getId(); + System.out.println(" Created session with options: " + sessionId); + + // Clean up + client.deleteBrowser(sessionId); + + System.out.println("✓ Browser create with options test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserExecuteBash() { + System.out.println("Testing browser execute with bash..."); + + // Create a session + BrowserCreateResponse createRes = client.browser(); + assertTrue(createRes.isSuccess(), "Create should succeed"); + String sessionId = createRes.getId(); + + try { + // Execute bash code + BrowserExecuteResponse execRes = client.browserExecute(sessionId, "echo 'hello from java sdk'"); + assertNotNull(execRes, "Execute response should not be null"); + assertTrue(execRes.isSuccess(), "Execute should succeed"); + assertNotNull(execRes.getStdout(), "Stdout should not be null"); + assertTrue(execRes.getStdout().contains("hello from java sdk"), + "Stdout should contain our echo output"); + + System.out.println(" Stdout: " + execRes.getStdout().trim()); + System.out.println(" Exit code: " + execRes.getExitCode()); + } finally { + // Clean up + client.deleteBrowser(sessionId); + } + + System.out.println("✓ Browser execute bash test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserExecuteNode() { + System.out.println("Testing 
browser execute with node..."); + + // Create a session + BrowserCreateResponse createRes = client.browser(); + assertTrue(createRes.isSuccess(), "Create should succeed"); + String sessionId = createRes.getId(); + + try { + // Execute node code + BrowserExecuteResponse execRes = client.browserExecute( + sessionId, "console.log(1 + 2)", "node", null); + assertNotNull(execRes, "Execute response should not be null"); + assertTrue(execRes.isSuccess(), "Execute should succeed"); + + System.out.println(" Stdout: " + (execRes.getStdout() != null ? execRes.getStdout().trim() : "null")); + } finally { + client.deleteBrowser(sessionId); + } + + System.out.println("✓ Browser execute node test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserExecutePython() { + System.out.println("Testing browser execute with python..."); + + // Create a session + BrowserCreateResponse createRes = client.browser(); + assertTrue(createRes.isSuccess(), "Create should succeed"); + String sessionId = createRes.getId(); + + try { + // Execute python code + BrowserExecuteResponse execRes = client.browserExecute( + sessionId, "print('hello from python')", "python", null); + assertNotNull(execRes, "Execute response should not be null"); + assertTrue(execRes.isSuccess(), "Execute should succeed"); + + System.out.println(" Stdout: " + (execRes.getStdout() != null ? 
execRes.getStdout().trim() : "null")); + } finally { + client.deleteBrowser(sessionId); + } + + System.out.println("✓ Browser execute python test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserExecuteWithTimeout() { + System.out.println("Testing browser execute with custom timeout..."); + + // Create a session + BrowserCreateResponse createRes = client.browser(); + assertTrue(createRes.isSuccess(), "Create should succeed"); + String sessionId = createRes.getId(); + + try { + // Execute with custom timeout (60 seconds) + BrowserExecuteResponse execRes = client.browserExecute( + sessionId, "echo 'timeout test'", "bash", 60); + assertNotNull(execRes, "Execute response should not be null"); + assertTrue(execRes.isSuccess(), "Execute should succeed"); + + System.out.println(" Stdout: " + (execRes.getStdout() != null ? execRes.getStdout().trim() : "null")); + } finally { + client.deleteBrowser(sessionId); + } + + System.out.println("✓ Browser execute with timeout test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserListSessions() { + System.out.println("Testing list browser sessions..."); + + // List all sessions + BrowserListResponse listRes = client.listBrowsers(); + assertNotNull(listRes, "List response should not be null"); + assertTrue(listRes.isSuccess(), "List should succeed"); + + System.out.println(" Total sessions: " + (listRes.getSessions() != null ? 
listRes.getSessions().size() : 0)); + + System.out.println("✓ List browser sessions test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserListActiveFilter() { + System.out.println("Testing list browser sessions with active filter..."); + + // Create a session so we have at least one active + BrowserCreateResponse createRes = client.browser(); + assertTrue(createRes.isSuccess(), "Create should succeed"); + String sessionId = createRes.getId(); + + try { + // List only active sessions + BrowserListResponse listRes = client.listBrowsers("active"); + assertNotNull(listRes, "List response should not be null"); + assertTrue(listRes.isSuccess(), "List should succeed"); + assertNotNull(listRes.getSessions(), "Sessions list should not be null"); + assertFalse(listRes.getSessions().isEmpty(), "Should have at least one active session"); + + System.out.println(" Active sessions: " + listRes.getSessions().size()); + } finally { + client.deleteBrowser(sessionId); + } + + System.out.println("✓ List active browser sessions test passed"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testBrowserFullLifecycle() { + System.out.println("Testing full browser session lifecycle..."); + + // 1. Create session + BrowserCreateResponse createRes = client.browser(300, 120, true); + assertTrue(createRes.isSuccess(), "Create should succeed"); + assertNotNull(createRes.getId(), "Should have session ID"); + String sessionId = createRes.getId(); + System.out.println(" 1. Created session: " + sessionId); + + // CDP URL and live view URL may be present + if (createRes.getCdpUrl() != null) { + System.out.println(" CDP URL present: true"); + } + if (createRes.getLiveViewUrl() != null) { + System.out.println(" Live View URL present: true"); + } + + // 2. 
Navigate to a page + BrowserExecuteResponse navRes = client.browserExecute( + sessionId, "agent-browser open https://example.com", "bash", 30); + assertTrue(navRes.isSuccess(), "Navigation should succeed"); + System.out.println(" 2. Navigated to example.com"); + + // 3. Take a snapshot + BrowserExecuteResponse snapRes = client.browserExecute( + sessionId, "agent-browser snapshot -i -c", "bash", 30); + assertTrue(snapRes.isSuccess(), "Snapshot should succeed"); + System.out.println(" 3. Took snapshot"); + + // 4. Get page title + BrowserExecuteResponse titleRes = client.browserExecute( + sessionId, "agent-browser get title", "bash", 30); + assertTrue(titleRes.isSuccess(), "Get title should succeed"); + System.out.println(" 4. Page title: " + (titleRes.getStdout() != null ? titleRes.getStdout().trim() : "null")); + + // 5. Verify session is active + BrowserListResponse listRes = client.listBrowsers("active"); + assertTrue(listRes.isSuccess(), "List should succeed"); + System.out.println(" 5. Active sessions: " + (listRes.getSessions() != null ? listRes.getSessions().size() : 0)); + + // 6. Delete session + BrowserDeleteResponse deleteRes = client.deleteBrowser(sessionId); + assertTrue(deleteRes.isSuccess(), "Delete should succeed"); + System.out.println(" 6. 
Deleted session"); + if (deleteRes.getSessionDurationMs() != null) { + System.out.println(" Session duration: " + deleteRes.getSessionDurationMs() + "ms"); + } + if (deleteRes.getCreditsBilled() != null) { + System.out.println(" Credits billed: " + deleteRes.getCreditsBilled()); + } + + System.out.println("✓ Full browser session lifecycle test passed"); + } + + @Test + void testBrowserExecuteRequiresSessionId() { + FirecrawlClient testClient = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + + assertThrows(NullPointerException.class, () -> + testClient.browserExecute(null, "echo test") + ); + } + + @Test + void testBrowserExecuteRequiresCode() { + FirecrawlClient testClient = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + + assertThrows(NullPointerException.class, () -> + testClient.browserExecute("some-session-id", null) + ); + } + + @Test + void testBrowserDeleteRequiresSessionId() { + FirecrawlClient testClient = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + + assertThrows(NullPointerException.class, () -> + testClient.deleteBrowser(null) + ); + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/CrawlTest.java b/apps/java-sdk/src/test/java/com/firecrawl/CrawlTest.java new file mode 100644 index 0000000000..efe984829b --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/CrawlTest.java @@ -0,0 +1,302 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.errors.FirecrawlException; +import com.firecrawl.models.*; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Comprehensive Crawl Tests + * + * Tests the crawl functionality with various configurations. + * Based on Node.js SDK patterns and tested against live firecrawl.dev. 
+ * + * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.CrawlTest" + */ +class CrawlTest { + + private static FirecrawlClient client; + + @BeforeAll + static void setup() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey != null && !apiKey.isBlank()) { + client = FirecrawlClient.fromEnv(); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testStartCrawlMinimal() { + System.out.println("\n=== Test: Start Crawl - Minimal Request ==="); + + CrawlResponse response = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(3) + .build()); + + assertNotNull(response, "Crawl response should not be null"); + assertNotNull(response.getId(), "Crawl ID should not be null"); + assertNotNull(response.getUrl(), "Crawl URL should not be null"); + + System.out.println("✓ Crawl started successfully"); + System.out.println(" Job ID: " + response.getId()); + System.out.println(" Status URL: " + response.getUrl()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testStartCrawlWithOptions() { + System.out.println("\n=== Test: Start Crawl - With Options ==="); + + CrawlResponse response = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(5) + .maxDiscoveryDepth(2) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + assertNotNull(response.getUrl(), "Status URL should not be null"); + + System.out.println("✓ Crawl with options started"); + System.out.println(" Limit: 5 pages"); + System.out.println(" Max depth: 2"); + System.out.println(" Job ID: " + response.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testGetCrawlStatus() { + System.out.println("\n=== Test: Get Crawl Status ==="); + + // Start a crawl + CrawlResponse start = client.startCrawl("https://docs.firecrawl.dev", + 
CrawlOptions.builder() + .limit(3) + .build()); + + System.out.println("CrawlResponse: " + start); + System.out.println("ID: " + start.getId()); + assertNotNull(start, "CrawlResponse should not be null"); + assertNotNull(start.getId(), "Crawl ID should not be null"); + + // Get status + CrawlJob status = client.getCrawlStatus(start.getId()); + + assertNotNull(status, "Status should not be null"); + assertNotNull(status.getStatus(), "Status field should not be null"); + assertTrue(List.of("scraping", "completed", "failed", "cancelled").contains(status.getStatus()), + "Status should be valid: " + status.getStatus()); + assertTrue(status.getCompleted() >= 0, "Completed count should be non-negative"); + // Data may be null while crawl is still in progress (status=scraping) + if ("completed".equals(status.getStatus())) { + assertNotNull(status.getData(), "Data should not be null when completed"); + } + + System.out.println("✓ Status retrieved successfully"); + System.out.println(" Status: " + status.getStatus()); + System.out.println(" Completed: " + status.getCompleted() + "/" + status.getTotal()); + System.out.println(" Documents: " + (status.getData() != null ? 
status.getData().size() : 0)); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCancelCrawl() { + System.out.println("\n=== Test: Cancel Crawl ==="); + + CrawlResponse start = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(10) + .build()); + + Map result = client.cancelCrawl(start.getId()); + + assertNotNull(result, "Cancel result should not be null"); + + System.out.println("✓ Crawl cancelled successfully"); + System.out.println(" Job ID: " + start.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithWait() { + System.out.println("\n=== Test: Crawl with Wait (Blocking) ==="); + + CrawlJob job = client.crawl("https://firecrawl.dev", + CrawlOptions.builder() + .limit(3) + .maxDiscoveryDepth(1) + .build(), + 2, // pollInterval in seconds + 120 // timeout in seconds + ); + + assertNotNull(job, "Job should not be null"); + assertTrue(List.of("completed", "failed").contains(job.getStatus()), + "Final status should be completed or failed: " + job.getStatus()); + assertTrue(job.getCompleted() >= 0, "Completed count should be non-negative"); + assertTrue(job.getTotal() >= 0, "Total count should be non-negative"); + assertNotNull(job.getData(), "Data should not be null"); + + System.out.println("✓ Crawl completed (with wait)"); + System.out.println(" Final status: " + job.getStatus()); + System.out.println(" Pages crawled: " + job.getCompleted() + "/" + job.getTotal()); + System.out.println(" Documents returned: " + job.getData().size()); + + if (!job.getData().isEmpty()) { + Document firstDoc = job.getData().get(0); + System.out.println(" Sample URL: " + firstDoc.getMetadata().get("sourceURL")); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithScrapeOptions() { + System.out.println("\n=== Test: Crawl with Scrape Options ==="); + + 
CrawlResponse response = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(2) + .scrapeOptions(ScrapeOptions.builder() + .formats(List.of("markdown", "links")) + .onlyMainContent(true) + .build()) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + + System.out.println("✓ Crawl with scrape options started"); + System.out.println(" Formats: markdown, links"); + System.out.println(" Only main content: true"); + System.out.println(" Job ID: " + response.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithExcludePaths() { + System.out.println("\n=== Test: Crawl with Exclude Paths ==="); + + CrawlResponse response = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(5) + .excludePaths(List.of("/blog/*", "/admin/*")) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + + System.out.println("✓ Crawl with exclude paths started"); + System.out.println(" Excluding: /blog/*, /admin/*"); + System.out.println(" Job ID: " + response.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithIncludePaths() { + System.out.println("\n=== Test: Crawl with Include Paths ==="); + + CrawlResponse response = client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(5) + .includePaths(List.of("/docs/*")) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + + System.out.println("✓ Crawl with include paths started"); + System.out.println(" Including only: /docs/*"); + System.out.println(" Job ID: " + response.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithAllowExternalLinks() { + System.out.println("\n=== Test: Crawl with Allow External Links ==="); + + CrawlResponse response = 
client.startCrawl("https://docs.firecrawl.dev", + CrawlOptions.builder() + .limit(5) + .allowExternalLinks(true) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + + System.out.println("✓ Crawl with external links allowed"); + System.out.println(" Job ID: " + response.getId()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlWithWebhookConfig() { + System.out.println("\n=== Test: Crawl with Webhook (if available) ==="); + + try { + // Using a test webhook URL (requestbin, webhook.site, etc.) + CrawlResponse response = client.startCrawl("https://firecrawl.dev", + CrawlOptions.builder() + .limit(2) + .webhook(WebhookConfig.builder() + .url("https://webhook.site/test") + .build()) + .build()); + + assertNotNull(response.getId(), "Job ID should not be null"); + + System.out.println("✓ Crawl with webhook started"); + System.out.println(" Job ID: " + response.getId()); + } catch (Exception e) { + System.out.println("⚠ Webhook test skipped or failed: " + e.getMessage()); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlFirecrawlHomepage() { + System.out.println("\n=== Test: Crawl Firecrawl.dev Homepage ==="); + + CrawlJob job = client.crawl("https://firecrawl.dev", + CrawlOptions.builder() + .limit(5) + .maxDiscoveryDepth(2) + .scrapeOptions(ScrapeOptions.builder() + .formats(List.of("markdown")) + .onlyMainContent(true) + .build()) + .build(), + 2, + 120 + ); + + assertNotNull(job, "Job should not be null"); + assertTrue(job.getData() != null && !job.getData().isEmpty(), + "Should have crawled at least one page"); + + // Verify content from Firecrawl site + boolean hasFirecrawlContent = job.getData().stream() + .anyMatch(doc -> { + String markdown = doc.getMarkdown(); + return markdown != null && + (markdown.toLowerCase().contains("firecrawl") || + markdown.toLowerCase().contains("scrape") || + 
markdown.toLowerCase().contains("crawl")); + }); + + assertTrue(hasFirecrawlContent, "Should contain Firecrawl-related content"); + + System.out.println("✓ Successfully crawled Firecrawl homepage"); + System.out.println(" Pages crawled: " + job.getData().size()); + System.out.println(" Status: " + job.getStatus()); + + // Print sample URLs + System.out.println(" Sample pages:"); + job.getData().stream() + .limit(3) + .forEach(doc -> System.out.println(" - " + doc.getMetadata().get("sourceURL"))); + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java new file mode 100644 index 0000000000..bebed14ff0 --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java @@ -0,0 +1,194 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.errors.FirecrawlException; +import com.firecrawl.models.*; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Integration tests for the Firecrawl Java SDK. + * + *

These tests require a valid FIRECRAWL_API_KEY environment variable. + * Run with: FIRECRAWL_API_KEY=fc-xxx ./gradlew test + */ +class FirecrawlClientTest { + + @Test + void testBuilderRequiresApiKey() { + assertThrows(FirecrawlException.class, () -> + FirecrawlClient.builder().apiKey("").build() + ); + } + + @Test + void testBuilderAcceptsApiKey() { + // Should not throw — just validates construction + FirecrawlClient client = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + assertNotNull(client); + } + + @Test + void testScrapeOptionsBuilder() { + ScrapeOptions options = ScrapeOptions.builder() + .formats(List.of("markdown", "html")) + .onlyMainContent(true) + .timeout(30000) + .mobile(false) + .build(); + + assertEquals(List.of("markdown", "html"), options.getFormats()); + assertTrue(options.getOnlyMainContent()); + assertEquals(30000, options.getTimeout()); + assertFalse(options.getMobile()); + } + + @Test + void testCrawlOptionsBuilder() { + CrawlOptions options = CrawlOptions.builder() + .limit(100) + .maxDiscoveryDepth(3) + .sitemap("include") + .excludePaths(List.of("/admin/*")) + .build(); + + assertEquals(100, options.getLimit()); + assertEquals(3, options.getMaxDiscoveryDepth()); + assertEquals("include", options.getSitemap()); + assertEquals(List.of("/admin/*"), options.getExcludePaths()); + } + + @Test + void testAgentOptionsRequiresPrompt() { + assertThrows(IllegalArgumentException.class, () -> + AgentOptions.builder().build() + ); + } + + @Test + void testWebhookConfigRequiresUrl() { + assertThrows(IllegalArgumentException.class, () -> + WebhookConfig.builder().build() + ); + } + + @Test + void testScrapeOptionsToBuilder() { + ScrapeOptions original = ScrapeOptions.builder() + .formats(List.of("markdown")) + .timeout(5000) + .build(); + + ScrapeOptions modified = original.toBuilder() + .timeout(10000) + .build(); + + assertEquals(5000, original.getTimeout()); + assertEquals(10000, modified.getTimeout()); + 
assertEquals(List.of("markdown"), modified.getFormats()); + } + + @Test + void testBrowserExecuteRequiresSessionId() { + FirecrawlClient client = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + assertThrows(NullPointerException.class, () -> + client.browserExecute(null, "echo test") + ); + } + + @Test + void testBrowserDeleteRequiresSessionId() { + FirecrawlClient client = FirecrawlClient.builder() + .apiKey("fc-test-key") + .build(); + assertThrows(NullPointerException.class, () -> + client.deleteBrowser(null) + ); + } + + // ================================================================ + // E2E TESTS (require FIRECRAWL_API_KEY) + // ================================================================ + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testScrapeE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + Document doc = client.scrape("https://example.com", + ScrapeOptions.builder() + .formats(List.of("markdown")) + .build()); + + assertNotNull(doc); + assertNotNull(doc.getMarkdown()); + assertFalse(doc.getMarkdown().isEmpty()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + MapData data = client.map("https://example.com", + MapOptions.builder() + .limit(10) + .build()); + + assertNotNull(data); + assertNotNull(data.getLinks()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCrawlE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + CrawlJob job = client.crawl("https://example.com", + CrawlOptions.builder() + .limit(3) + .build(), + 2, 60); + + assertNotNull(job); + assertEquals("completed", job.getStatus()); + assertNotNull(job.getData()); + assertFalse(job.getData().isEmpty()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void 
testSearchE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + SearchData data = client.search("firecrawl web scraping", + SearchOptions.builder() + .limit(5) + .build()); + + assertNotNull(data); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testConcurrencyE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + ConcurrencyCheck check = client.getConcurrency(); + + assertNotNull(check); + assertTrue(check.getMaxConcurrency() > 0); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testCreditUsageE2E() { + FirecrawlClient client = FirecrawlClient.fromEnv(); + CreditUsage usage = client.getCreditUsage(); + + assertNotNull(usage); + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlLiveSiteTest.java b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlLiveSiteTest.java new file mode 100644 index 0000000000..54b3651ce8 --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlLiveSiteTest.java @@ -0,0 +1,131 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.Document; +import com.firecrawl.models.ScrapeOptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Live Site Test - Firecrawl.dev + * + * Tests the Java SDK against the actual Firecrawl production website. + * This demonstrates real-world usage of the API against live content. 
+ * + * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.FirecrawlLiveSiteTest" + */ +class FirecrawlLiveSiteTest { + + private static FirecrawlClient client; + + @BeforeAll + static void setup() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey != null && !apiKey.isBlank()) { + client = FirecrawlClient.fromEnv(); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testScrapeFirecrawlHomepage() { + System.out.println("\n=== Testing against LIVE Firecrawl.dev website ===\n"); + System.out.println("Scraping: https://firecrawl.dev"); + + Document doc = client.scrape("https://firecrawl.dev", + ScrapeOptions.builder() + .formats(List.of("markdown", "html")) + .onlyMainContent(true) + .build()); + + // Assertions + assertNotNull(doc, "Document should not be null"); + assertNotNull(doc.getMarkdown(), "Markdown content should not be null"); + assertNotNull(doc.getHtml(), "HTML content should not be null"); + assertNotNull(doc.getMetadata(), "Metadata should not be null"); + + // Verify it's actually the Firecrawl site + String markdown = doc.getMarkdown().toLowerCase(); + assertTrue(markdown.contains("firecrawl") || markdown.contains("scrape") || markdown.contains("crawl"), + "Content should mention Firecrawl features"); + + // Check metadata + String sourceUrl = doc.getMetadata().get("sourceURL").toString(); + assertTrue(sourceUrl.contains("firecrawl.dev"), "Source URL should be firecrawl.dev"); + + // Display results + System.out.println("\n✓ Successfully scraped Firecrawl.dev!"); + System.out.println("\nMetadata:"); + System.out.println(" Source URL: " + sourceUrl); + if (doc.getMetadata().get("title") != null) { + System.out.println(" Title: " + doc.getMetadata().get("title")); + } + System.out.println(" Status Code: " + doc.getMetadata().get("statusCode")); + + System.out.println("\nContent Stats:"); + System.out.println(" Markdown length: " + doc.getMarkdown().length() + " 
characters"); + System.out.println(" HTML length: " + doc.getHtml().length() + " characters"); + + System.out.println("\nFirst 500 characters of markdown:"); + System.out.println(" " + doc.getMarkdown().substring(0, Math.min(500, doc.getMarkdown().length())).replace("\n", "\n ")); + + System.out.println("\n=== Live site test completed successfully! ===\n"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testScrapeFirecrawlPricing() { + System.out.println("\n=== Testing Firecrawl Pricing Page ===\n"); + System.out.println("Scraping: https://firecrawl.dev/pricing"); + + Document doc = client.scrape("https://firecrawl.dev/pricing", + ScrapeOptions.builder() + .formats(List.of("markdown")) + .build()); + + // Assertions + assertNotNull(doc, "Document should not be null"); + assertNotNull(doc.getMarkdown(), "Markdown content should not be null"); + + String markdown = doc.getMarkdown().toLowerCase(); + assertTrue(markdown.contains("pricing") || markdown.contains("plan") || markdown.contains("price"), + "Pricing page should contain pricing information"); + + System.out.println("✓ Successfully scraped pricing page!"); + System.out.println(" Content length: " + doc.getMarkdown().length() + " characters"); + System.out.println(" Source: " + doc.getMetadata().get("sourceURL")); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testScrapeFirecrawlDocs() { + System.out.println("\n=== Testing Firecrawl Documentation ===\n"); + System.out.println("Scraping: https://docs.firecrawl.dev"); + + Document doc = client.scrape("https://docs.firecrawl.dev", + ScrapeOptions.builder() + .formats(List.of("markdown")) + .waitFor(2000) // Wait for docs to load + .build()); + + // Assertions + assertNotNull(doc, "Document should not be null"); + assertNotNull(doc.getMarkdown(), "Markdown content should not be null"); + assertFalse(doc.getMarkdown().isEmpty(), "Markdown should not be 
empty"); + + String markdown = doc.getMarkdown().toLowerCase(); + assertTrue(markdown.contains("document") || markdown.contains("api") || markdown.contains("firecrawl"), + "Docs should contain documentation content"); + + System.out.println("✓ Successfully scraped documentation!"); + System.out.println(" Content length: " + doc.getMarkdown().length() + " characters"); + System.out.println(" Source: " + doc.getMetadata().get("sourceURL")); + + System.out.println("\n=== All Firecrawl.dev tests passed! ===\n"); + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/MapTest.java b/apps/java-sdk/src/test/java/com/firecrawl/MapTest.java new file mode 100644 index 0000000000..731de37eea --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/MapTest.java @@ -0,0 +1,279 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.MapData; +import com.firecrawl.models.MapOptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Comprehensive Map Tests + * + * Tests the map functionality with various configurations. + * Based on Node.js SDK patterns and tested against live firecrawl.dev. 
+ * + * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.MapTest" + */ +class MapTest { + + private static FirecrawlClient client; + + @BeforeAll + static void setup() { + String apiKey = System.getenv("FIRECRAWL_API_KEY"); + if (apiKey != null && !apiKey.isBlank()) { + client = FirecrawlClient.fromEnv(); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapMinimal() { + System.out.println("\n=== Test: Map - Minimal Request ==="); + + MapData data = client.map("https://docs.firecrawl.dev"); + + assertNotNull(data, "Map data should not be null"); + assertNotNull(data.getLinks(), "Links should not be null"); + assertTrue(!data.getLinks().isEmpty(), "Should have at least one link"); + + // Verify link structure (v2 links are MapDocument objects with url, title, description) + Map firstLink = data.getLinks().get(0); + assertNotNull(firstLink, "Link should not be null"); + assertNotNull(firstLink.get("url"), "Link should have url"); + assertTrue(firstLink.get("url").toString().startsWith("http"), "URL should start with http"); + + System.out.println("✓ Map completed successfully"); + System.out.println(" Total links found: " + data.getLinks().size()); + System.out.println(" Sample URL: " + firstLink.get("url")); + if (firstLink.get("title") != null) { + System.out.println(" Title: " + firstLink.get("title")); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapWithLimit() { + System.out.println("\n=== Test: Map with Limit ==="); + + MapData data = client.map("https://docs.firecrawl.dev", + MapOptions.builder() + .limit(10) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + assertTrue(data.getLinks().size() <= 10, + "Should respect limit of 10: got " + data.getLinks().size()); + + System.out.println("✓ Map with limit completed"); + System.out.println(" Requested limit: 10"); + System.out.println(" 
Actual links: " + data.getLinks().size()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapWithSearch() { + System.out.println("\n=== Test: Map with Search Filter ==="); + + MapData data = client.map("https://docs.firecrawl.dev", + MapOptions.builder() + .search("api") + .limit(20) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + + // Verify that filtered results contain the search term + long matchingLinks = data.getLinks().stream() + .filter(link -> { + String url = link.get("url") != null ? link.get("url").toString().toLowerCase() : ""; + String title = link.get("title") != null ? link.get("title").toString().toLowerCase() : ""; + return url.contains("api") || title.contains("api"); + }) + .count(); + + System.out.println("✓ Map with search completed"); + System.out.println(" Total links: " + data.getLinks().size()); + System.out.println(" Links matching 'api': " + matchingLinks); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapWithSkipSitemap() { + System.out.println("\n=== Test: Map with Sitemap Skip ==="); + + MapData data = client.map("https://firecrawl.dev", + MapOptions.builder() + .sitemap("skip") + .limit(15) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + assertTrue(data.getLinks().size() <= 15, "Should respect limit"); + + // Verify all links are valid HTTP(S) URLs + boolean allValidUrls = data.getLinks().stream() + .allMatch(link -> { + String url = link.get("url") != null ? 
link.get("url").toString() : ""; + return url.startsWith("http://") || url.startsWith("https://"); + }); + + assertTrue(allValidUrls, "All URLs should be valid HTTP(S)"); + + System.out.println("✓ Map with sitemap=skip completed"); + System.out.println(" Links found: " + data.getLinks().size()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapWithSitemapOnly() { + System.out.println("\n=== Test: Map with Sitemap Only ==="); + + MapData data = client.map("https://firecrawl.dev", + MapOptions.builder() + .sitemap("only") + .limit(50) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + // Note: sitemapOnly may not always respect the limit strictly + + // Verify all links are valid HTTP(S) URLs + boolean allValidUrls = data.getLinks().stream() + .allMatch(link -> { + String url = link.get("url") != null ? link.get("url").toString() : ""; + return url.startsWith("http://") || url.startsWith("https://"); + }); + + assertTrue(allValidUrls, "All URLs should be valid HTTP(S)"); + + System.out.println("✓ Map with sitemap=only completed"); + System.out.println(" Links found: " + data.getLinks().size()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapWithIncludeSubdomains() { + System.out.println("\n=== Test: Map with Include Subdomains ==="); + + MapData data = client.map("https://firecrawl.dev", + MapOptions.builder() + .includeSubdomains(true) + .limit(20) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + + System.out.println("✓ Map with subdomains completed"); + System.out.println(" Total links: " + data.getLinks().size()); + + // Check if any subdomains were found + boolean hasSubdomains = data.getLinks().stream() + .anyMatch(link -> { + String url = link.get("url") != null ? 
link.get("url").toString() : ""; + return url.contains("docs.firecrawl.dev") || + url.contains("api.firecrawl.dev") || + (url.contains(".firecrawl.dev") && !url.contains("www.firecrawl.dev")); + }); + + if (hasSubdomains) { + System.out.println(" ✓ Found subdomain links"); + } + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapFirecrawlDocs() { + System.out.println("\n=== Test: Map Firecrawl Documentation ==="); + + MapData data = client.map("https://docs.firecrawl.dev", + MapOptions.builder() + .limit(50) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + assertFalse(data.getLinks().isEmpty(), "Should find documentation links"); + + System.out.println("✓ Mapped Firecrawl documentation"); + System.out.println(" Total links: " + data.getLinks().size()); + + // Print sample links + System.out.println(" Sample documentation pages:"); + data.getLinks().stream() + .limit(5) + .forEach(link -> System.out.println(" - " + link.get("url"))); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapLinkStructure() { + System.out.println("\n=== Test: Verify Map Link Structure ==="); + + MapData data = client.map("https://firecrawl.dev", + MapOptions.builder() + .limit(5) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + assertFalse(data.getLinks().isEmpty(), "Should have links"); + + // Verify each link is a valid URL with expected fields + for (Map link : data.getLinks()) { + assertNotNull(link, "Link should not be null"); + assertNotNull(link.get("url"), "Link should have url field"); + assertTrue(link.get("url").toString().startsWith("http"), "URL should be valid: " + link.get("url")); + } + + System.out.println("✓ All links have correct structure"); + System.out.println(" Verified " + data.getLinks().size() + " links"); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = 
".*\\S.*") + void testMapWithTimeout() { + System.out.println("\n=== Test: Map with Timeout ==="); + + MapData data = client.map("https://firecrawl.dev", + MapOptions.builder() + .timeout(15000) // 15 seconds + .limit(10) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + + System.out.println("✓ Map with timeout completed"); + System.out.println(" Timeout: 15000ms"); + System.out.println(" Links found: " + data.getLinks().size()); + } + + @Test + @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*") + void testMapComprehensive() { + System.out.println("\n=== Test: Map with All Options ==="); + + MapData data = client.map("https://docs.firecrawl.dev", + MapOptions.builder() + .includeSubdomains(false) + .limit(25) + .sitemap("include") + .timeout(20000) + .build()); + + assertNotNull(data.getLinks(), "Links should not be null"); + assertTrue(data.getLinks().size() <= 25, "Should respect limit"); + + System.out.println("✓ Comprehensive map completed"); + System.out.println(" Configuration:"); + System.out.println(" - Include subdomains: false"); + System.out.println(" - Limit: 25"); + System.out.println(" - Ignore sitemap: false"); + System.out.println(" - Timeout: 20000ms"); + System.out.println(" Results:"); + System.out.println(" - Links found: " + data.getLinks().size()); + } +} diff --git a/apps/java-sdk/src/test/java/com/firecrawl/ScrapeTest.java b/apps/java-sdk/src/test/java/com/firecrawl/ScrapeTest.java new file mode 100644 index 0000000000..c91cbc30a3 --- /dev/null +++ b/apps/java-sdk/src/test/java/com/firecrawl/ScrapeTest.java @@ -0,0 +1,177 @@ +package com.firecrawl; + +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.errors.FirecrawlException; +import com.firecrawl.models.Document; +import com.firecrawl.models.ScrapeOptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; + +import 
java.util.List;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Scrape Endpoint Tests
 *
 * Exercises the scrape functionality of the Firecrawl Java SDK.
 * Every test requires the FIRECRAWL_API_KEY environment variable.
 *
 * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.ScrapeTest"
 */
class ScrapeTest {

    private static FirecrawlClient client;

    @BeforeAll
    static void setup() {
        // Initialize client from environment variable
        String apiKey = System.getenv("FIRECRAWL_API_KEY");
        if (apiKey != null && !apiKey.isBlank()) {
            client = FirecrawlClient.fromEnv();
        }
    }

    /** Single-format scrape: markdown must be present and non-empty. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeBasic() {
        System.out.println("Testing basic scrape with markdown format...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document, "Document should not be null");
        assertNotNull(document.getMarkdown(), "Markdown content should not be null");
        assertFalse(document.getMarkdown().isEmpty(), "Markdown content should not be empty");

        System.out.println("✓ Basic scrape test passed");
        System.out.println(" Markdown length: " + document.getMarkdown().length() + " characters");
    }

    /** Requesting two formats must populate both markdown and html. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeWithMultipleFormats() {
        System.out.println("Testing scrape with multiple formats (markdown + html)...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown", "html"))
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document, "Document should not be null");
        assertNotNull(document.getMarkdown(), "Markdown content should not be null");
        assertNotNull(document.getHtml(), "HTML content should not be null");
        assertFalse(document.getMarkdown().isEmpty(), "Markdown should not be empty");
        assertFalse(document.getHtml().isEmpty(), "HTML should not be empty");

        System.out.println("✓ Multiple formats test passed");
        System.out.println(" Markdown length: " + document.getMarkdown().length());
        System.out.println(" HTML length: " + document.getHtml().length());
    }

    /** Metadata must be present and carry a sourceURL pointing back at the target. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeWithMetadata() {
        System.out.println("Testing scrape with metadata extraction...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document.getMetadata(), "Metadata should not be null");
        assertNotNull(document.getMetadata().get("sourceURL"), "Source URL should be in metadata");
        assertTrue(document.getMetadata().get("sourceURL").toString().contains("example.com"),
                "Source URL should contain example.com");

        System.out.println("✓ Metadata extraction test passed");
        System.out.println(" Source URL: " + document.getMetadata().get("sourceURL"));
        if (document.getMetadata().get("title") != null) {
            System.out.println(" Title: " + document.getMetadata().get("title"));
        }
    }

    /** onlyMainContent should still yield non-empty markdown. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeWithOnlyMainContent() {
        System.out.println("Testing scrape with onlyMainContent option...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .onlyMainContent(true)
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document, "Document should not be null");
        assertNotNull(document.getMarkdown(), "Markdown content should not be null");
        assertFalse(document.getMarkdown().isEmpty(), "Markdown should not be empty");

        System.out.println("✓ Only main content test passed");
        System.out.println(" Content length: " + document.getMarkdown().length());
    }

    /** A custom request timeout must be accepted and the scrape still succeed. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeWithTimeout() {
        System.out.println("Testing scrape with custom timeout...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .timeout(10000) // 10 seconds
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document, "Document should not be null");
        assertNotNull(document.getMarkdown(), "Markdown should not be null");

        System.out.println("✓ Timeout configuration test passed");
    }

    /** A malformed URL must be rejected with a FirecrawlException. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeInvalidUrl() {
        System.out.println("Testing scrape with invalid URL...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .build();
        assertThrows(FirecrawlException.class,
                () -> client.scrape("not-a-valid-url", options),
                "Should throw FirecrawlException for invalid URL");

        System.out.println("✓ Invalid URL handling test passed");
    }

    /** waitFor delays rendering before capture; useful for dynamic content. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testScrapeWithWaitFor() {
        System.out.println("Testing scrape with waitFor option...");

        ScrapeOptions options = ScrapeOptions.builder()
                .formats(List.of("markdown"))
                .waitFor(1000) // Wait 1 second for page to load
                .build();
        Document document = client.scrape("https://example.com", options);

        assertNotNull(document, "Document should not be null");
        assertNotNull(document.getMarkdown(), "Markdown should not be null");

        System.out.println("✓ WaitFor option test passed");
    }
}

// --- next file: apps/java-sdk/src/test/java/com/firecrawl/SearchTest.java ---
package com.firecrawl;

import com.firecrawl.client.FirecrawlClient;
import com.firecrawl.models.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable;

import java.util.List;
import java.util.Map;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Comprehensive Search Tests
 *
 * Tests the search functionality with various configurations.
 * Based on Node.js SDK patterns and tested against live firecrawl.dev.
 *
 * Run with: FIRECRAWL_API_KEY=fc-xxx gradle test --tests "com.firecrawl.SearchTest"
 */
class SearchTest {

    private static FirecrawlClient client;

    @BeforeAll
    static void setup() {
        // Build the client only when an API key is available; each test is
        // additionally gated by @EnabledIfEnvironmentVariable.
        String apiKey = System.getenv("FIRECRAWL_API_KEY");
        if (apiKey != null && !apiKey.isBlank()) {
            client = FirecrawlClient.fromEnv();
        }
    }

    /** Smallest possible search call: query only, default options. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchMinimal() {
        System.out.println("\n=== Test: Search - Minimal Request ===");

        SearchData results = client.search("What is Firecrawl?");

        assertNotNull(results, "Search results should not be null");
        assertNotNull(results.getWeb(), "Web results should not be null");
        // assertFalse is the idiomatic form of assertTrue(!...)
        assertFalse(results.getWeb().isEmpty(), "Should have at least one web result");

        // Verify result structure
        Map firstResult = results.getWeb().get(0);
        assertNotNull(firstResult.get("url"), "Result should have URL");
        assertTrue(firstResult.get("url").toString().startsWith("http"),
                "URL should be valid");

        System.out.println("✓ Search completed successfully");
        System.out.println(" Web results: " + results.getWeb().size());
        System.out.println(" Sample result: " + firstResult.get("url"));
        if (firstResult.get("title") != null) {
            System.out.println(" Title: " + firstResult.get("title"));
        }
    }

    /** The server should return at most `limit` web results. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchWithLimit() {
        System.out.println("\n=== Test: Search with Limit ===");

        SearchData results = client.search("artificial intelligence",
                SearchOptions.builder()
                        .limit(5)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");
        assertTrue(results.getWeb().size() <= 5,
                "Should respect limit of 5: got " + results.getWeb().size());

        System.out.println("✓ Search with limit completed");
        System.out.println(" Requested limit: 5");
        System.out.println(" Actual results: " + results.getWeb().size());
    }

    /** Web + news sources; news results may legitimately be absent. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchWithMultipleSources() {
        System.out.println("\n=== Test: Search with Multiple Sources ===");

        SearchData results = client.search("Firecrawl web scraping",
                SearchOptions.builder()
                        .sources(List.of("web", "news"))
                        .limit(3)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");
        assertTrue(results.getWeb().size() <= 3, "Web results should respect limit");

        System.out.println("✓ Multi-source search completed");
        System.out.println(" Web results: " + results.getWeb().size());
        if (results.getNews() != null) {
            System.out.println(" News results: " + results.getNews().size());
        } else {
            System.out.println(" News results: 0");
        }
    }

    /** Each result must have a String url; title/description are optional Strings. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchResultStructure() {
        System.out.println("\n=== Test: Verify Search Result Structure ===");

        SearchData results = client.search("test query",
                SearchOptions.builder()
                        .limit(1)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");

        if (!results.getWeb().isEmpty()) {
            Map result = results.getWeb().get(0);

            assertNotNull(result.get("url"), "Result must have URL");
            assertTrue(result.get("url") instanceof String, "URL should be string");
            assertTrue(result.get("url").toString().startsWith("http"),
                    "URL should be valid");

            // Title and description may be null but if present should be strings
            if (result.get("title") != null) {
                assertTrue(result.get("title") instanceof String,
                        "Title should be string");
            }
            if (result.get("description") != null) {
                assertTrue(result.get("description") instanceof String,
                        "Description should be string");
            }

            System.out.println("✓ Result structure verified");
            System.out.println(" URL: ✓");
            System.out.println(" Title: " + (result.get("title") != null ? "✓" : "null"));
            System.out.println(" Description: " + (result.get("description") != null ? "✓" : "null"));
        }
    }

    /** Location-biased search. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchWithLocation() {
        System.out.println("\n=== Test: Search with Location ===");

        SearchData results = client.search("restaurants near me",
                SearchOptions.builder()
                        .location("US")
                        .limit(5)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");

        System.out.println("✓ Search with location completed");
        System.out.println(" Location: US");
        System.out.println(" Results: " + results.getWeb().size());
    }

    /** Time-bounded search via Google tbs syntax (qdr:m = past month). */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchWithTimeFilter() {
        System.out.println("\n=== Test: Search with Time Filter ===");

        SearchData results = client.search("latest AI news",
                SearchOptions.builder()
                        .tbs("qdr:m") // Past month
                        .limit(5)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");

        System.out.println("✓ Search with time filter completed");
        System.out.println(" Time filter: Past month (qdr:m)");
        System.out.println(" Results: " + results.getWeb().size());
    }

    /** search + scrapeOptions: results should carry scraped markdown content. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchWithScrapeOptions() {
        System.out.println("\n=== Test: Search with Scrape Options ===");

        SearchData results = client.search("Firecrawl documentation",
                SearchOptions.builder()
                        .limit(2)
                        .scrapeOptions(ScrapeOptions.builder()
                                .formats(List.of("markdown"))
                                .onlyMainContent(true)
                                .build())
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");

        // When scrapeOptions with markdown format are provided, results should include markdown content
        if (!results.getWeb().isEmpty()) {
            Map first = results.getWeb().get(0);
            Object markdown = first.get("markdown");
            assertNotNull(markdown, "Scraped result should contain markdown content when formats=[markdown]");
            assertFalse(markdown.toString().isEmpty(), "Markdown content should not be empty");
        }

        System.out.println("✓ Search with scrape options completed");
        System.out.println(" Results: " + results.getWeb().size());
        System.out.println(" Scrape formats: markdown");
    }

    /** Relevance check: at least one result should mention Firecrawl. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchFirecrawlSpecific() {
        System.out.println("\n=== Test: Search for Firecrawl ===");

        SearchData results = client.search("Firecrawl web scraping API",
                SearchOptions.builder()
                        .limit(10)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");
        assertFalse(results.getWeb().isEmpty(), "Should find Firecrawl results");

        // Verify results contain Firecrawl-related content.
        // Null-safe: a result without a url would previously NPE inside anyMatch.
        boolean hasFirecrawlContent = results.getWeb().stream()
                .anyMatch(result -> {
                    String url = result.get("url") != null ?
                            result.get("url").toString().toLowerCase() : "";
                    String title = result.get("title") != null ?
                            result.get("title").toString().toLowerCase() : "";
                    String desc = result.get("description") != null ?
                            result.get("description").toString().toLowerCase() : "";

                    return url.contains("firecrawl") ||
                            title.contains("firecrawl") ||
                            desc.contains("firecrawl");
                });

        assertTrue(hasFirecrawlContent, "Results should mention Firecrawl");

        System.out.println("✓ Firecrawl search completed");
        System.out.println(" Total results: " + results.getWeb().size());
        System.out.println(" Results mentioning Firecrawl: ✓");

        // Print sample results
        System.out.println(" Sample results:");
        results.getWeb().stream()
                .limit(3)
                .forEach(result -> {
                    System.out.println(" - " + result.get("title"));
                    System.out.println(" " + result.get("url"));
                });
    }

    /** All options combined in a single request. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchComprehensive() {
        System.out.println("\n=== Test: Search with All Options ===");

        SearchData results = client.search("web scraping tools",
                SearchOptions.builder()
                        .sources(List.of("web"))
                        .limit(5)
                        .tbs("qdr:y") // Past year
                        .location("US")
                        .timeout(30000)
                        .scrapeOptions(ScrapeOptions.builder()
                                .formats(List.of("markdown"))
                                .onlyMainContent(true)
                                .waitFor(1000)
                                .build())
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");
        assertTrue(results.getWeb().size() <= 5, "Should respect limit");

        System.out.println("✓ Comprehensive search completed");
        System.out.println(" Configuration:");
        System.out.println(" - Sources: web");
        System.out.println(" - Limit: 5");
        System.out.println(" - Time filter: Past year");
        System.out.println(" - Location: US");
        System.out.println(" - Timeout: 30000ms");
        System.out.println(" - Scrape: markdown, main content only");
        System.out.println(" Results:");
        System.out.println(" - Web results: " + results.getWeb().size());
    }

    /** Relevance check against a well-known query term. */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchContentVerification() {
        System.out.println("\n=== Test: Search Content Verification ===");

        SearchData results = client.search("Python programming language",
                SearchOptions.builder()
                        .limit(5)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");
        assertFalse(results.getWeb().isEmpty(), "Should have results");

        // Verify results are relevant to the query (String.format tolerates nulls,
        // rendering them as "null")
        boolean hasRelevantContent = results.getWeb().stream()
                .anyMatch(result -> {
                    String text = String.format("%s %s %s",
                            result.get("url"),
                            result.get("title"),
                            result.get("description")
                    ).toLowerCase();
                    return text.contains("python");
                });

        assertTrue(hasRelevantContent, "Results should be relevant to query");

        System.out.println("✓ Content verification passed");
        System.out.println(" Query: Python programming language");
        System.out.println(" Relevant results found: ✓");
    }

    /** ignoreInvalidURLs=true: every returned URL must be well-formed HTTP(S). */
    @Test
    @EnabledIfEnvironmentVariable(named = "FIRECRAWL_API_KEY", matches = ".*\\S.*")
    void testSearchIgnoreInvalidURLs() {
        System.out.println("\n=== Test: Search with Ignore Invalid URLs ===");

        SearchData results = client.search("technology news",
                SearchOptions.builder()
                        .limit(5)
                        .ignoreInvalidURLs(true)
                        .build());

        assertNotNull(results.getWeb(), "Web results should not be null");

        // Verify all URLs are valid. Null-safe: a missing url previously threw
        // NPE inside allMatch; now it fails the assertion with a clear message.
        boolean allValidUrls = results.getWeb().stream()
                .allMatch(result -> {
                    Object urlValue = result.get("url");
                    if (urlValue == null) {
                        return false;
                    }
                    String url = urlValue.toString();
                    return url.startsWith("http://") || url.startsWith("https://");
                });

        assertTrue(allValidUrls, "All URLs should be valid HTTP(S)");

        System.out.println("✓ Search with URL validation completed");
        System.out.println(" Results: " + results.getWeb().size());
        System.out.println(" All URLs valid: ✓");
    }
}