From 6481d35ea5bc6bce48bd3b2abf6976f3413aa441 Mon Sep 17 00:00:00 2001 From: Selinali01 Date: Wed, 29 Apr 2026 00:48:39 -0700 Subject: [PATCH] IG reel scraper apify --- packages/bubble-core/package.json | 2 +- .../apify/actors/instagram-reel-scraper.ts | 104 ++++++++ .../apify/apify-scraper.schema.ts | 12 + .../src/bubbles/service-bubble/apify/apify.ts | 4 +- .../src/bubbles/tool-bubble/instagram-tool.ts | 250 +++++++++++++++++- packages/bubble-runtime/package.json | 2 +- packages/bubble-scope-manager/package.json | 2 +- packages/bubble-shared-schemas/package.json | 2 +- packages/create-bubblelab-app/package.json | 2 +- .../templates/basic/package.json | 6 +- .../templates/reddit-scraper/package.json | 4 +- 11 files changed, 369 insertions(+), 21 deletions(-) create mode 100644 packages/bubble-core/src/bubbles/service-bubble/apify/actors/instagram-reel-scraper.ts diff --git a/packages/bubble-core/package.json b/packages/bubble-core/package.json index ea342e85..395c6261 100644 --- a/packages/bubble-core/package.json +++ b/packages/bubble-core/package.json @@ -1,6 +1,6 @@ { "name": "@bubblelab/bubble-core", - "version": "0.1.321", + "version": "0.1.322", "type": "module", "license": "Apache-2.0", "main": "./dist/index.js", diff --git a/packages/bubble-core/src/bubbles/service-bubble/apify/actors/instagram-reel-scraper.ts b/packages/bubble-core/src/bubbles/service-bubble/apify/actors/instagram-reel-scraper.ts new file mode 100644 index 00000000..c8c7e0fd --- /dev/null +++ b/packages/bubble-core/src/bubbles/service-bubble/apify/actors/instagram-reel-scraper.ts @@ -0,0 +1,104 @@ +import { z } from 'zod'; +import { InstagramPostSchema } from './instagram-scraper.js'; + +// ============================================================================ +// INSTAGRAM REEL SCRAPER SCHEMAS +// ============================================================================ + +export const InstagramReelScraperInputSchema = z.object({ + username: z + .array(z.string()) + .min( + 1, + 
'At least one username, profile URL, profile ID, or reel URL is required' + ) + .describe( + 'Instagram usernames, profile URLs, profile IDs, or direct reel URLs. Examples: ["ryanbailey.cb"], ["https://www.instagram.com/ryanbailey.cb/"], ["https://www.instagram.com/p/DXIlvPbj2PY/"]' + ), + + resultsLimit: z + .number() + .min(1) + .default(20) + .describe('Maximum number of reels to scrape per profile (min 1)'), + + onlyPostsNewerThan: z + .string() + .optional() + .describe( + 'Only return reels posted on or after this date. Accepts YYYY-MM-DD, ISO timestamp, or relative time like "1 day" / "2 weeks"' + ), + + skipPinnedPosts: z + .boolean() + .default(false) + .optional() + .describe('Exclude pinned reels from the results'), + + includeSharesCount: z + .boolean() + .default(false) + .optional() + .describe('Extract the number of shares for each reel (paid add-on)'), + + includeTranscript: z + .boolean() + .default(false) + .optional() + .describe( + 'Extract an auto-generated text transcript of the reel audio (paid add-on)' + ), + + includeDownloadedVideo: z + .boolean() + .default(false) + .optional() + .describe('Include a direct MP4 download URL for each reel (paid add-on)'), +}); + +// Reel scraper extends the base post schema with reel-specific fields +export const InstagramReelScraperItemSchema = InstagramPostSchema.extend({ + inputUrl: z + .string() + .optional() + .describe('Original input profile or reel URL'), + ownerFullName: z.string().optional().describe('Reel owner full name'), + sharesCount: z + .number() + .optional() + .describe('Number of shares (only present when includeSharesCount=true)'), + videoPlayCount: z.number().optional().describe('Number of video plays'), + videoDuration: z.number().optional().describe('Reel length in seconds'), + videoUrl: z.string().optional().describe('CDN video URL'), + downloadedVideo: z + .string() + .optional() + .describe( + 'Direct MP4 download URL (only present when includeDownloadedVideo=true)' + ), + 
transcript: z + .string() + .optional() + .describe( + 'Auto-generated speech transcript (only present when includeTranscript=true)' + ), + firstComment: z.string().optional().describe('First/top comment text'), + latestComments: z + .array(z.unknown()) + .optional() + .describe('Array of latest comments with owner, text, likes, replies'), + coauthorProducers: z + .array(z.unknown()) + .optional() + .describe('Co-creator information'), + musicInfo: z + .object({ + artist_name: z.string().optional(), + song_name: z.string().optional(), + uses_original_audio: z.boolean().optional(), + audio_id: z.string().optional(), + }) + .passthrough() + .optional() + .describe('Music/audio information for the reel'), +}); diff --git a/packages/bubble-core/src/bubbles/service-bubble/apify/apify-scraper.schema.ts b/packages/bubble-core/src/bubbles/service-bubble/apify/apify-scraper.schema.ts index bf09e41a..682be581 100644 --- a/packages/bubble-core/src/bubbles/service-bubble/apify/apify-scraper.schema.ts +++ b/packages/bubble-core/src/bubbles/service-bubble/apify/apify-scraper.schema.ts @@ -6,6 +6,10 @@ import { InstagramHashtagScraperInputSchema, InstagramHashtagScraperItemSchema, } from './actors/instagram-hashtag-scraper.js'; +import { + InstagramReelScraperInputSchema, + InstagramReelScraperItemSchema, +} from './actors/instagram-reel-scraper.js'; import { LinkedInProfilePostsInputSchema, LinkedInProfilePostsOutputSchema, @@ -66,6 +70,14 @@ export const APIFY_ACTOR_SCHEMAS = { documentation: 'https://apify.com/apify/instagram-hashtag-scraper', category: 'social-media', }, + 'apify/instagram-reel-scraper': { + input: InstagramReelScraperInputSchema, + output: InstagramReelScraperItemSchema, + description: + 'Scrape Instagram reels from profiles or direct reel URLs with optional transcript, share counts, and downloaded MP4 URLs', + documentation: 'https://apify.com/apify/instagram-reel-scraper', + category: 'social-media', + }, 'apimaestro/linkedin-profile-posts': { input: 
LinkedInProfilePostsInputSchema, output: LinkedInProfilePostsOutputSchema, diff --git a/packages/bubble-core/src/bubbles/service-bubble/apify/apify.ts b/packages/bubble-core/src/bubbles/service-bubble/apify/apify.ts index 92cd0502..2be75cb0 100644 --- a/packages/bubble-core/src/bubbles/service-bubble/apify/apify.ts +++ b/packages/bubble-core/src/bubbles/service-bubble/apify/apify.ts @@ -52,11 +52,11 @@ const ApifyParamsSchema = z.object({ timeout: z .number() .min(1000) - .max(500000) + .max(1800000) .optional() .default(300000) .describe( - 'Maximum time to wait for actor completion in milliseconds (default: 120000)' + 'Maximum time to wait for actor completion in milliseconds (default: 300000 = 5 min, max: 1800000 = 30 min). Long-running actors (e.g., apify/instagram-reel-scraper with transcripts) need >300s.' ), credentials: z .record(z.nativeEnum(CredentialType), z.string()) diff --git a/packages/bubble-core/src/bubbles/tool-bubble/instagram-tool.ts b/packages/bubble-core/src/bubbles/tool-bubble/instagram-tool.ts index 574d2f03..8518126e 100644 --- a/packages/bubble-core/src/bubbles/tool-bubble/instagram-tool.ts +++ b/packages/bubble-core/src/bubbles/tool-bubble/instagram-tool.ts @@ -32,12 +32,51 @@ const InstagramProfileSchema = z.object({ profilePicUrl: z.string().nullable().describe('Profile picture URL'), }); +const InstagramReelSchema = z.object({ + url: z.string().nullable().describe('Reel URL'), + shortCode: z.string().nullable().describe('Instagram short code'), + caption: z.string().nullable().describe('Reel caption'), + ownerUsername: z.string().nullable().describe('Reel owner username'), + ownerFullName: z.string().nullable().describe('Reel owner full name'), + timestamp: z.string().nullable().describe('Reel timestamp (ISO format)'), + videoUrl: z.string().nullable().describe('CDN video URL'), + downloadedVideo: z + .string() + .nullable() + .describe( + 'Direct MP4 download URL — only populated when includeDownloadedVideo=true' + ), + videoDuration: 
z.number().nullable().describe('Reel duration in seconds'), + videoViewCount: z.number().nullable().describe('Video view count'), + videoPlayCount: z.number().nullable().describe('Video play count'), + likesCount: z.number().nullable().describe('Number of likes'), + commentsCount: z.number().nullable().describe('Number of comments'), + sharesCount: z + .number() + .nullable() + .describe('Number of shares — only populated when includeSharesCount=true'), + hashtags: z.array(z.string()).nullable().describe('Hashtags in the reel'), + mentions: z + .array(z.string()) + .nullable() + .describe('User mentions in the reel'), + transcript: z + .string() + .nullable() + .describe( + 'Auto-generated speech transcript — only populated when includeTranscript=true' + ), + musicArtist: z.string().nullable().describe('Music artist name'), + musicTitle: z.string().nullable().describe('Music/song title'), + displayUrl: z.string().nullable().describe('Display thumbnail URL'), +}); + // Gemini-compatible single object schema with optional fields const InstagramToolParamsSchema = z.object({ operation: z - .enum(['scrapeProfile', 'scrapeHashtag']) + .enum(['scrapeProfile', 'scrapeHashtag', 'scrapeReels']) .describe( - 'Operation to perform: scrapeProfile for user profiles, scrapeHashtag for hashtag posts' + 'Operation to perform: scrapeProfile for user profiles, scrapeHashtag for hashtag posts, scrapeReels for reels (with optional transcript)' ), // Profile scraping fields (optional) @@ -56,6 +95,50 @@ const InstagramToolParamsSchema = z.object({ 'Hashtags to scrape (for scrapeHashtag operation). Examples: ["ai", "tech"] or ["https://www.instagram.com/explore/tags/ai"]' ), + // Reel scraping fields (optional) + targets: z + .array(z.string()) + .optional() + .describe( + 'Instagram usernames, profile URLs, profile IDs, or direct reel URLs (for scrapeReels operation). 
You can mix forms in one array — e.g., ["ryanbailey.cb", "https://www.instagram.com/p/DXIlvPbj2PY/"] pulls reels from a profile AND scrapes a specific reel in one call. Other valid examples: ["ryanbailey.cb"], ["https://www.instagram.com/ryanbailey.cb/"], ["https://www.instagram.com/p/DXIlvPbj2PY/"]' + ), + includeTranscript: z + .boolean() + .optional() + .describe( + 'For scrapeReels: extract auto-generated speech transcripts of each reel (paid add-on)' + ), + includeSharesCount: z + .boolean() + .optional() + .describe( + 'For scrapeReels: extract number of shares for each reel (paid add-on)' + ), + includeDownloadedVideo: z + .boolean() + .optional() + .describe( + 'For scrapeReels: include a direct MP4 download URL (a string URL — NOT video bytes — populated in the `downloadedVideo` field of each reel) for each reel. Paid add-on.' + ), + skipPinnedPosts: z + .boolean() + .optional() + .describe('For scrapeReels: exclude pinned reels from results'), + onlyPostsNewerThan: z + .string() + .optional() + .describe( + 'For scrapeReels: only return reels posted on or after this date. Accepts YYYY-MM-DD, ISO timestamp, or relative like "1 day" / "2 weeks"' + ), + timeoutSecs: z + .number() + .min(60) + .max(1800) + .optional() + .describe( + 'For scrapeReels: max seconds to wait for the Apify actor (default 600). Bump if you enable includeTranscript on >5 reels — transcript generation adds ~1-2 min per reel.' 
+ ), + // Common fields limit: z .number() @@ -64,7 +147,7 @@ const InstagramToolParamsSchema = z.object({ .default(20) .optional() .describe( - 'Maximum number of posts to fetch (default: 20 for profiles, 50 for hashtags)' + 'Maximum number of items to fetch (default: 20 for profiles/reels, 50 for hashtags)' ), credentials: z @@ -76,13 +159,24 @@ const InstagramToolParamsSchema = z.object({ // Gemini-compatible single result schema const InstagramToolResultSchema = z.object({ operation: z - .enum(['scrapeProfile', 'scrapeHashtag']) + .enum(['scrapeProfile', 'scrapeHashtag', 'scrapeReels']) .describe('Operation that was performed'), - // Posts data (always present) + // Posts data (only for scrapeProfile / scrapeHashtag) posts: z .array(InstagramPostSchema) - .describe('Array of Instagram posts scraped'), + .optional() + .describe( + 'Array of Instagram posts scraped (only for scrapeProfile / scrapeHashtag operations — for scrapeReels see the `reels` field)' + ), + + // Reels data (only for scrapeReels operation) + reels: z + .array(InstagramReelSchema) + .optional() + .describe( + 'Reels with reel-specific fields like transcript, video URL, share count (only for scrapeReels operation)' + ), // Profile data (only for scrapeProfile operation) profiles: z @@ -108,8 +202,16 @@ const InstagramToolResultSchema = z.object({ 'List of profile usernames that were scraped (only for scrapeProfile operation)' ), + // Target data (only for scrapeReels operation) + scrapedTargets: z + .array(z.string()) + .optional() + .describe( + 'List of inputs that were scraped (only for scrapeReels operation)' + ), + // Common fields - totalPosts: z.number().describe('Total number of posts scraped'), + totalPosts: z.number().describe('Total number of posts/reels scraped'), success: z.boolean().describe('Whether the operation was successful'), error: z.string().describe('Error message if operation failed'), }); @@ -120,6 +222,7 @@ type InstagramToolResult = z.output; type 
InstagramToolParamsInput = z.input<typeof InstagramToolParamsSchema>; export type InstagramPost = z.output<typeof InstagramPostSchema>; export type InstagramProfile = z.output<typeof InstagramProfileSchema>; +export type InstagramReel = z.output<typeof InstagramReelSchema>; // Helper type to get the result type for a specific operation export type InstagramOperationResult< @@ -157,11 +260,18 @@ export class InstagramTool extends ToolBubble< - Get profile information (bio, followers, verified status) - Fetch recent posts from specific users - Track influencer or brand accounts - + 2. **scrapeHashtag**: Scrape posts by hashtag - Find trending content by hashtag - Monitor brand mentions and campaigns - Research hashtag performance + + 3. **scrapeReels**: Scrape reels from a profile or directly from reel URLs + - Get reels with video URLs, view/play counts, hashtags, music info + - Optionally extract auto-generated speech transcripts (paid add-on) + - Optionally extract share counts and direct MP4 download URLs (paid add-ons) + - Filter by date with onlyPostsNewerThan, skip pinned reels with skipPinnedPosts + - **Latency note**: transcript generation adds ~1-2 min per reel. Default timeout is 10 min (600s); for >5 reels with includeTranscript, bump via the timeoutSecs param. **WHEN TO USE THIS TOOL:** - **Any Instagram scraping task** - profiles, posts, hashtags, engagement data @@ -245,12 +355,28 @@ export class InstagramTool extends ToolBubble< ); } + if ( + operation === 'scrapeReels' && + (!this.params.targets || this.params.targets.length === 0) + ) { + if (this.params.profiles && this.params.profiles.length > 0) { + return this.createErrorResult( + 'scrapeReels uses the `targets` field, not `profiles`. Move your input to `targets` — it accepts usernames, profile URLs, profile IDs, or direct reel URLs.' + ); + } + return this.createErrorResult( + 'Targets array is required for scrapeReels operation. Pass usernames, profile URLs, or direct reel URLs in the `targets` field.' 
+ ); + } + const result = await (async (): Promise<InstagramToolResult> => { switch (operation) { case 'scrapeProfile': return await this.handleScrapeProfile(this.params); case 'scrapeHashtag': return await this.handleScrapeHashtag(this.params); + case 'scrapeReels': + return await this.handleScrapeReels(this.params); default: throw new Error(`Unsupported operation: ${operation}`); } @@ -272,10 +398,12 @@ export class InstagramTool extends ToolBubble< return { operation: operation || 'scrapeProfile', - posts: [], + posts: operation === 'scrapeReels' ? undefined : [], + reels: operation === 'scrapeReels' ? [] : undefined, profiles: operation === 'scrapeProfile' ? [] : undefined, scrapedProfiles: operation === 'scrapeProfile' ? [] : undefined, scrapedHashtags: operation === 'scrapeHashtag' ? [] : undefined, + scrapedTargets: operation === 'scrapeReels' ? [] : undefined, totalPosts: 0, success: false, error: errorMessage, @@ -331,6 +459,65 @@ export class InstagramTool extends ToolBubble< }; } + /** + * Handle scrapeReels operation + */ + private async handleScrapeReels( + params: InstagramToolParams + ): Promise<InstagramToolResult> { + const targets = params.targets!; + + const scrape_reels_apify = new ApifyBubble<'apify/instagram-reel-scraper'>( + { + actorId: 'apify/instagram-reel-scraper', + input: { + username: targets, + resultsLimit: params.limit || 20, + skipPinnedPosts: params.skipPinnedPosts ?? false, + includeSharesCount: params.includeSharesCount ?? false, + includeTranscript: params.includeTranscript ?? false, + includeDownloadedVideo: params.includeDownloadedVideo ?? false, + ...(params.onlyPostsNewerThan + ? { onlyPostsNewerThan: params.onlyPostsNewerThan } + : {}), + }, + waitForFinish: true, + timeout: (params.timeoutSecs ?? 
600) * 1000, + credentials: params.credentials, + limit: params.limit || 20, + }, + this.context, + 'scrape_reels_apify' + ); + + const apifyResult = await scrape_reels_apify.action(); + + if (!apifyResult.data.success) { + return { + operation: 'scrapeReels', + reels: [], + scrapedTargets: targets, + totalPosts: 0, + success: false, + error: + apifyResult.data.error || + 'Failed to scrape Instagram reels. Please try again.', + }; + } + + const items = apifyResult.data.items || []; + const reels = this.extractReels(items); + + return { + operation: 'scrapeReels', + reels, + scrapedTargets: targets, + totalPosts: reels.length, + success: true, + error: '', + }; + } + /** * Scrape hashtags using Apify service * This is the current implementation - future versions could add other services @@ -600,6 +787,51 @@ export class InstagramTool extends ToolBubble< return profiles; } + /** + * Extract reels from reel scraper results + * Reel scraper returns reels directly (not nested under profiles) + */ + private extractReels( + items: ActorOutput<'apify/instagram-reel-scraper'>[] + ): InstagramReel[] { + const reels: InstagramReel[] = []; + + for (const item of items) { + if (typeof item !== 'object' || item === null) continue; + + const anyItem = item as Record<string, unknown>; + const music = (anyItem.musicInfo ?? null) as Record< + string, + unknown + > | null; + + reels.push({ + url: (anyItem.url as string) || null, + shortCode: (anyItem.shortCode as string) || null, + caption: (anyItem.caption as string) || null, + ownerUsername: (anyItem.ownerUsername as string) || null, + ownerFullName: (anyItem.ownerFullName as string) || null, + timestamp: (anyItem.timestamp as string) || null, + videoUrl: (anyItem.videoUrl as string) || null, + downloadedVideo: (anyItem.downloadedVideo as string) || null, + videoDuration: (anyItem.videoDuration as number) ?? null, + videoViewCount: (anyItem.videoViewCount as number) ?? null, + videoPlayCount: (anyItem.videoPlayCount as number) ?? 
null, + likesCount: (anyItem.likesCount as number) ?? null, + commentsCount: (anyItem.commentsCount as number) ?? null, + sharesCount: (anyItem.sharesCount as number) ?? null, + hashtags: (anyItem.hashtags as string[]) || null, + mentions: (anyItem.mentions as string[]) || null, + transcript: (anyItem.transcript as string) || null, + musicArtist: (music?.artist_name as string) || null, + musicTitle: (music?.song_name as string) || null, + displayUrl: (anyItem.displayUrl as string) || null, + }); + } + + return reels; + } + /** * Extract posts from hashtag scraper results * Hashtag scraper returns posts directly (not nested) diff --git a/packages/bubble-runtime/package.json b/packages/bubble-runtime/package.json index 1db40507..a505f34c 100644 --- a/packages/bubble-runtime/package.json +++ b/packages/bubble-runtime/package.json @@ -1,6 +1,6 @@ { "name": "@bubblelab/bubble-runtime", - "version": "0.1.321", + "version": "0.1.322", "type": "module", "license": "Apache-2.0", "main": "./dist/index.js", diff --git a/packages/bubble-scope-manager/package.json b/packages/bubble-scope-manager/package.json index d0c0d5ca..23375a2a 100644 --- a/packages/bubble-scope-manager/package.json +++ b/packages/bubble-scope-manager/package.json @@ -1,6 +1,6 @@ { "name": "@bubblelab/ts-scope-manager", - "version": "0.1.321", + "version": "0.1.322", "private": false, "license": "MIT", "type": "commonjs", diff --git a/packages/bubble-shared-schemas/package.json b/packages/bubble-shared-schemas/package.json index ede2ab24..d016ee08 100644 --- a/packages/bubble-shared-schemas/package.json +++ b/packages/bubble-shared-schemas/package.json @@ -1,6 +1,6 @@ { "name": "@bubblelab/shared-schemas", - "version": "0.1.321", + "version": "0.1.322", "type": "module", "license": "Apache-2.0", "main": "./dist/index.js", diff --git a/packages/create-bubblelab-app/package.json b/packages/create-bubblelab-app/package.json index 489bc161..b6e21409 100644 --- a/packages/create-bubblelab-app/package.json +++ 
b/packages/create-bubblelab-app/package.json @@ -1,6 +1,6 @@ { "name": "create-bubblelab-app", - "version": "0.1.321", + "version": "0.1.322", "type": "module", "license": "Apache-2.0", "description": "Create BubbleLab AI agent applications with one command", diff --git a/packages/create-bubblelab-app/templates/basic/package.json b/packages/create-bubblelab-app/templates/basic/package.json index e2cd7c02..8ef60f2b 100644 --- a/packages/create-bubblelab-app/templates/basic/package.json +++ b/packages/create-bubblelab-app/templates/basic/package.json @@ -11,9 +11,9 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@bubblelab/bubble-core": "^0.1.321", - "@bubblelab/bubble-runtime": "^0.1.321", - "@bubblelab/shared-schemas": "^0.1.321", + "@bubblelab/bubble-core": "^0.1.322", + "@bubblelab/bubble-runtime": "^0.1.322", + "@bubblelab/shared-schemas": "^0.1.322", "dotenv": "^16.4.5" }, "devDependencies": { diff --git a/packages/create-bubblelab-app/templates/reddit-scraper/package.json b/packages/create-bubblelab-app/templates/reddit-scraper/package.json index b1804e35..2080c7aa 100644 --- a/packages/create-bubblelab-app/templates/reddit-scraper/package.json +++ b/packages/create-bubblelab-app/templates/reddit-scraper/package.json @@ -11,8 +11,8 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@bubblelab/bubble-core": "^0.1.321", - "@bubblelab/bubble-runtime": "^0.1.321", + "@bubblelab/bubble-core": "^0.1.322", + "@bubblelab/bubble-runtime": "^0.1.322", "dotenv": "^16.4.5" }, "devDependencies": {