diff --git a/.changeset/feat-groq-ai-transcription.md b/.changeset/feat-groq-ai-transcription.md new file mode 100644 index 000000000..d2bea4d5c --- /dev/null +++ b/.changeset/feat-groq-ai-transcription.md @@ -0,0 +1,7 @@ +--- +'@tanstack/ai-groq': minor +--- + +Adds Groq as a transcription provider. Groq's API is mostly OpenAI SDK-compatible, +but its transcription endpoint additionally accepts HTTP URLs as input, so this +is implemented as a custom integration rather than going through the SDK. diff --git a/docs/adapters/groq.md b/docs/adapters/groq.md index 1c4c81644..4fc1e6b25 100644 --- a/docs/adapters/groq.md +++ b/docs/adapters/groq.md @@ -2,7 +2,7 @@ title: Groq id: groq-adapter order: 6 -description: "Use Groq's fast inference API with TanStack AI for low-latency LLM responses — Llama and other open-weight models via @tanstack/ai-groq." +description: "Use Groq's fast inference API with TanStack AI for low-latency LLM responses and Whisper transcription — Llama and other open-weight models via @tanstack/ai-groq." keywords: - tanstack ai - groq @@ -11,9 +11,11 @@ keywords: - low latency - adapter - llm + - whisper + - transcription --- -The Groq adapter provides access to Groq's fast inference API, featuring the world's fastest LLM inference. +The Groq adapter provides access to Groq's fast inference API, featuring the world's fastest LLM inference and Whisper-based audio transcription. ## Installation @@ -108,6 +110,32 @@ const stream = chat({ }); ``` +## Transcription + +Groq exposes Whisper-based speech-to-text via `groqTranscription()` and the `generateTranscription()` activity. The `audio` input accepts a `File`, `Blob`, `ArrayBuffer`, base64 string, data URL, or an `https://` URL (forwarded directly to Groq without re-uploading). 
+ +```typescript +import { generateTranscription } from "@tanstack/ai"; +import { groqTranscription } from "@tanstack/ai-groq"; + +const result = await generateTranscription({ + adapter: groqTranscription("whisper-large-v3-turbo"), + audio: "https://example.com/recording.mp3", + language: "en", +}); + +console.log(result.text); + +// verbose_json (the default) populates language, duration, and timestamped segments +for (const segment of result.segments ?? []) { + console.log(`[${segment.start}s → ${segment.end}s] ${segment.text}`); +} +``` + +Supported models: `whisper-large-v3-turbo`, `whisper-large-v3`. Supported `responseFormat` values: `json`, `text`, `verbose_json` (default). `srt` and `vtt` are not supported by Groq. + +See [Transcription](../media/transcription) for the full API. + ## Model Options Groq supports various provider-specific options: @@ -197,11 +225,14 @@ Creates a Groq chat adapter with an explicit API key. **Returns:** A Groq chat adapter instance. +### `groqTranscription(model, config?)` / `createGroqTranscription(model, apiKey, config?)` + +Creates a Groq transcription (speech-to-text) adapter. The short form reads `GROQ_API_KEY` from the environment; the `create*` form takes an explicit API key. Supported models: `whisper-large-v3-turbo`, `whisper-large-v3`. + ## Limitations - **Text-to-Speech**: Groq does not currently expose a TTS adapter. Use OpenAI, Gemini, ElevenLabs, or fal for speech generation. - **Image Generation**: Groq does not support image generation. Use OpenAI, Gemini, or fal for image generation. -- **Transcription**: Groq does not currently expose a transcription adapter through TanStack AI. 
## Next Steps diff --git a/docs/media/transcription.md b/docs/media/transcription.md index eaf64dfad..abc767517 100644 --- a/docs/media/transcription.md +++ b/docs/media/transcription.md @@ -2,7 +2,7 @@ title: Transcription id: transcription order: 4 -description: "Transcribe audio to text with OpenAI Whisper and GPT-4o-transcribe via TanStack AI's generateTranscription() API." +description: "Transcribe audio to text with OpenAI Whisper, GPT-4o-transcribe, Groq Whisper, and fal.ai STT models via TanStack AI's generateTranscription() API." keywords: - tanstack ai - transcription @@ -11,11 +11,13 @@ keywords: - whisper - generateTranscription - openai + - groq + - fal --- # Audio Transcription -TanStack AI provides support for audio transcription (speech-to-text) through dedicated transcription adapters. This guide covers how to convert spoken audio into text using OpenAI's Whisper and GPT-4o transcription models. +TanStack AI provides support for audio transcription (speech-to-text) through dedicated transcription adapters. This guide covers how to convert spoken audio into text using OpenAI's Whisper and GPT-4o transcription models, Groq's hosted Whisper models, and fal.ai STT models. ## Overview @@ -23,6 +25,7 @@ Audio transcription is handled by transcription adapters that follow the same tr Currently supported: - **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe +- **Groq**: whisper-large-v3-turbo, whisper-large-v3 - **fal.ai**: Whisper, Wizper, speech-to-text turbo, ElevenLabs speech-to-text ## Basic Usage @@ -76,6 +79,31 @@ const result = await generateTranscription({ }) ``` +### Groq Transcription + +Groq hosts Whisper large-v3 and large-v3-turbo on its fast inference stack. The `audio` input accepts a `File`, `Blob`, `ArrayBuffer`, base64 string, data URL, or an `https://` URL (which is forwarded to Groq without re-uploading). 
+ +```typescript +import { generateTranscription } from '@tanstack/ai' +import { groqTranscription } from '@tanstack/ai-groq' + +const result = await generateTranscription({ + adapter: groqTranscription('whisper-large-v3-turbo'), + audio: 'https://example.com/recording.mp3', + language: 'en', +}) + +console.log(result.text) +console.log(result.language) + +// verbose_json is the default — segments carry segment-level start/end timestamps +for (const segment of result.segments ?? []) { + console.log(`[${segment.start}s → ${segment.end}s] ${segment.text}`) +} +``` + +> **Note:** Groq supports `responseFormat` values `json`, `text`, and `verbose_json` (default). `srt` and `vtt` are not supported — passing them throws. Provider-specific `modelOptions` are `temperature` and `timestamp_granularities` (`['word']`, `['segment']`, or both). + ### fal.ai Transcription fal.ai offers Whisper, Wizper, and other STT models. The `audio` input accepts a URL, `File`, `Blob`, or `ArrayBuffer` (auto-wrapped in a `Blob`). 
@@ -171,16 +199,18 @@ interface TranscriptionResult { text: string // Full transcribed text language?: string // Detected/specified language duration?: number // Audio duration in seconds - segments?: Array<{ // Timestamped segments + segments?: Array<{ // Segment-level timestamps + id: number // Segment identifier start: number // Start time in seconds end: number // End time in seconds text: string // Segment text - words?: Array<{ // Word-level timestamps - word: string - start: number - end: number - confidence?: number - }> + confidence?: number // Confidence score (0-1) + speaker?: string // Speaker identifier, if diarization is enabled + }> + words?: Array<{ // Word-level timestamps + word: string + start: number + end: number }> } ``` diff --git a/packages/ai-groq/src/adapters/transcription.ts b/packages/ai-groq/src/adapters/transcription.ts new file mode 100644 index 000000000..356dcf611 --- /dev/null +++ b/packages/ai-groq/src/adapters/transcription.ts @@ -0,0 +1,302 @@ +import { BaseTranscriptionAdapter } from '@tanstack/ai/adapters' +import { base64ToArrayBuffer, generateId } from '@tanstack/ai-utils' +import { getGroqApiKeyFromEnv, withGroqDefaults } from '../utils/client' +import type { + TranscriptionOptions, + TranscriptionResult, + TranscriptionSegment, +} from '@tanstack/ai' +import type { GroqTranscriptionModel } from '../model-meta' +import type { GroqTranscriptionProviderOptions } from '../audio/transcription-provider-options' +import type { GroqClientConfig } from '../utils/client' + +/** + * Configuration for the Groq Transcription adapter. 
+ */
+export interface GroqTranscriptionConfig extends GroqClientConfig {}
+
+// Shape of Groq's verbose_json transcription response
+interface GroqVerboseTranscriptionResponse {
+  task?: string
+  language?: string
+  duration?: number
+  text: string
+  segments?: Array<{
+    id: number
+    seek?: number
+    start: number
+    end: number
+    text: string
+    tokens?: Array<number>
+    temperature?: number
+    // Optional here because the response body is cast, not validated; the
+    // mapping below omits `confidence` when this is missing.
+    avg_logprob?: number
+    compression_ratio?: number
+    no_speech_prob?: number
+  }>
+  words?: Array<{ word: string; start: number; end: number }>
+  x_groq?: { id?: string }
+}
+
+// Shape of Groq's json transcription response
+interface GroqJsonTranscriptionResponse {
+  text: string
+  x_groq?: { id?: string }
+}
+
+/**
+ * Groq Transcription (Speech-to-Text) Adapter
+ *
+ * Tree-shakeable adapter for Groq audio transcription. Supports
+ * whisper-large-v3 and whisper-large-v3-turbo.
+ *
+ * Features:
+ * - Audio file uploads (File, Blob, ArrayBuffer, base64/data URL)
+ * - Remote audio URLs passed directly via Groq's `url` field — no upload needed
+ * - Verbose JSON response with segment and word timestamps
+ * - Language detection or specification (ISO-639-1)
+ * - Confidence scores derived from segment avg_logprob
+ * - Caller-supplied `defaultHeaders` forwarded on every request
+ */
+export class GroqTranscriptionAdapter<
+  TModel extends GroqTranscriptionModel,
+> extends BaseTranscriptionAdapter<TModel, GroqTranscriptionProviderOptions> {
+  readonly name = 'groq' as const
+
+  private readonly apiKey: string
+  private readonly baseURL: string
+  // Raw `defaultHeaders` value from the config. Kept as `unknown` because the
+  // option's declared shape lives on GroqClientConfig, outside this file; it is
+  // inspected at request time in `buildRequestHeaders`.
+  private readonly defaultHeaders: unknown
+
+  /**
+   * @param config - Client configuration; `apiKey` and `baseURL` are resolved
+   *   through `withGroqDefaults`, `defaultHeaders` (if present) is forwarded
+   *   with each request.
+   * @param model - Transcription model identifier bound to this adapter.
+   */
+  constructor(config: GroqTranscriptionConfig, model: TModel) {
+    super(model, {})
+    const resolved = withGroqDefaults(config)
+    this.apiKey = resolved.apiKey
+    this.baseURL = resolved.baseURL ?? 'https://api.groq.com/openai/v1'
+    // `in` narrowing keeps this line type-correct whether or not the config
+    // type declares the option.
+    this.defaultHeaders =
+      'defaultHeaders' in config ? config.defaultHeaders : undefined
+  }
+
+  /**
+   * Sends one transcription request to `${baseURL}/audio/transcriptions`.
+   *
+   * @param options - Shared transcription options plus Groq `modelOptions`
+   *   (`temperature`, `timestamp_granularities`).
+   * @returns The transcript; with `verbose_json` (the default) the result also
+   *   carries whichever of language, duration, segments and words the response
+   *   contained.
+   * @throws Error when `responseFormat` is `'srt'` or `'vtt'`, when the audio
+   *   input cannot be converted to a `File`, or when the HTTP response is not
+   *   OK (the message is the API's `error.message` when one is present).
+   */
+  async transcribe(
+    options: TranscriptionOptions<GroqTranscriptionProviderOptions>,
+  ): Promise<TranscriptionResult> {
+    const { model, audio, language, prompt, responseFormat, modelOptions } =
+      options
+
+    // Groq's transcription endpoint only accepts 'json', 'text', and
+    // 'verbose_json'. Reject 'srt'/'vtt' up front so callers get a clear
+    // message instead of an opaque Groq HTTP error.
+    if (responseFormat === 'srt' || responseFormat === 'vtt') {
+      throw new Error(
+        `Groq transcription does not support responseFormat='${responseFormat}'. ` +
+          `Supported values: 'json', 'text', 'verbose_json'.`,
+      )
+    }
+
+    // Default to verbose_json so callers get language, duration, and timestamps
+    // without having to opt in explicitly. Both Groq whisper models support it.
+    const effectiveFormat = responseFormat ?? 'verbose_json'
+    const useVerbose = effectiveFormat === 'verbose_json'
+
+    const form = new FormData()
+    form.append('model', model)
+    form.append('response_format', effectiveFormat)
+    if (language !== undefined) form.append('language', language)
+    if (prompt !== undefined) form.append('prompt', prompt)
+    if (modelOptions?.temperature !== undefined) {
+      form.append('temperature', String(modelOptions.temperature))
+    }
+    if (modelOptions?.timestamp_granularities !== undefined) {
+      for (const g of modelOptions.timestamp_granularities) {
+        form.append('timestamp_granularities[]', g)
+      }
+    }
+
+    // HTTP/HTTPS URLs are forwarded directly via Groq's `url` field, which
+    // avoids a round-trip upload. All other inputs (File, Blob, ArrayBuffer,
+    // base64, data URL) are converted to a File and sent as `file`.
+    // URI schemes are case-insensitive, so `HTTPS://…` must match as well;
+    // otherwise it would be handed to the base64 decoder below.
+    if (typeof audio === 'string' && /^https?:\/\//i.test(audio)) {
+      form.append('url', audio)
+    } else {
+      form.append('file', this.prepareAudioFile(audio))
+    }
+
+    try {
+      options.logger.request(
+        `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`,
+        { provider: this.name, model },
+      )
+
+      const response = await fetch(`${this.baseURL}/audio/transcriptions`, {
+        method: 'POST',
+        headers: this.buildRequestHeaders(),
+        body: form,
+      })
+
+      if (!response.ok) {
+        // A non-JSON error body resolves to null and falls back to the
+        // status-code message.
+        const body = await response
+          .json()
+          .catch(() => null as Record<string, unknown> | null)
+        const message =
+          (body?.error as { message?: string } | undefined)?.message ??
+          `Groq API error ${response.status}`
+        throw new Error(message)
+      }
+
+      if (useVerbose) {
+        const data = (await response.json()) as GroqVerboseTranscriptionResponse
+        const requestId = data.x_groq?.id ?? generateId(this.name)
+
+        // `TranscriptionResult` declares optional fields without `| undefined`,
+        // so under exactOptionalPropertyTypes we must omit absent fields rather
+        // than assigning `undefined`.
+        const segments = data.segments?.map(
+          (seg): TranscriptionSegment => ({
+            id: seg.id,
+            start: seg.start,
+            end: seg.end,
+            text: seg.text,
+            // exp(avg_logprob) turns the mean token log-probability into a
+            // probability. Skip it when the field is absent so callers never
+            // see `NaN`.
+            ...(typeof seg.avg_logprob === 'number' && {
+              confidence: Math.exp(seg.avg_logprob),
+            }),
+          }),
+        )
+        const words = data.words?.map((w) => ({
+          word: w.word,
+          start: w.start,
+          end: w.end,
+        }))
+
+        return {
+          id: requestId,
+          model,
+          text: data.text,
+          ...(data.language !== undefined && { language: data.language }),
+          ...(data.duration !== undefined && { duration: data.duration }),
+          ...(segments !== undefined && { segments }),
+          ...(words !== undefined && { words }),
+        }
+      } else if (effectiveFormat === 'text') {
+        // The 'text' format is read as a plain body, so there is no response
+        // id; the caller-supplied language is echoed back when given.
+        const text = await response.text()
+        return {
+          id: generateId(this.name),
+          model,
+          text,
+          ...(language !== undefined && { language }),
+        }
+      } else {
+        const data = (await response.json()) as GroqJsonTranscriptionResponse
+        return {
+          id: data.x_groq?.id ?? generateId(this.name),
+          model,
+          text: data.text,
+          ...(language !== undefined && { language }),
+        }
+      }
+    } catch (error: unknown) {
+      options.logger.errors(`${this.name}.transcribe fatal`, {
+        error,
+        source: `${this.name}.transcribe`,
+      })
+      throw error
+    }
+  }
+
+  /**
+   * Builds the headers for a transcription request: any string-valued entries
+   * from the configured `defaultHeaders`, then `Authorization`.
+   *
+   * `Content-Type` is deliberately not set here; `fetch` derives the multipart
+   * boundary from the `FormData` body.
+   */
+  private buildRequestHeaders(): Headers {
+    const headers = new Headers()
+    const extra = this.defaultHeaders
+    if (extra instanceof Headers) {
+      extra.forEach((value, key) => headers.set(key, value))
+    } else if (typeof extra === 'object' && extra !== null) {
+      for (const [key, value] of Object.entries(extra)) {
+        // Only strings are valid header values; anything else is skipped.
+        if (typeof value === 'string') headers.set(key, value)
+      }
+    }
+    // Set last so a caller-supplied Authorization header cannot displace the
+    // configured API key.
+    headers.set('Authorization', `Bearer ${this.apiKey}`)
+    return headers
+  }
+
+  /**
+   * Converts a non-URL audio input into a `File` for the multipart `file` field.
+   *
+   * - `File` is returned unchanged.
+   * - `Blob` / `ArrayBuffer` are wrapped as `audio.mp3`; a Blob keeps its own
+   *   MIME type when it has one.
+   * - A `data:` URL is split at the first comma; the MIME type from its header
+   *   sets both the file type and the file extension.
+   * - Any other string is treated as bare base64 and wrapped as `audio.mp3`.
+   *
+   * @throws Error when the input is none of the above, or when the global
+   *   `File` constructor is unavailable and a conversion is required.
+   */
+  private prepareAudioFile(audio: string | File | Blob | ArrayBuffer): File {
+    if (typeof File !== 'undefined' && audio instanceof File) {
+      return audio
+    }
+    if (typeof Blob !== 'undefined' && audio instanceof Blob) {
+      this.ensureFileSupport()
+      return new File([audio], 'audio.mp3', {
+        type: audio.type || 'audio/mpeg',
+      })
+    }
+    if (typeof ArrayBuffer !== 'undefined' && audio instanceof ArrayBuffer) {
+      this.ensureFileSupport()
+      return new File([audio], 'audio.mp3', { type: 'audio/mpeg' })
+    }
+    if (typeof audio === 'string') {
+      this.ensureFileSupport()
+
+      if (audio.startsWith('data:')) {
+        const parts = audio.split(',')
+        const header = parts[0]
+        const base64Data = parts[1] || ''
+        const mimeMatch = header?.match(/data:([^;]+)/)
+        const mimeType = mimeMatch?.[1] || 'audio/mpeg'
+        const bytes = base64ToArrayBuffer(base64Data)
+        const extension = mimeType.split('/')[1] || 'mp3'
+        return new File([bytes], `audio.${extension}`, { type: mimeType })
+      }
+
+      const bytes = base64ToArrayBuffer(audio)
+      return new File([bytes], 'audio.mp3', { type: 'audio/mpeg' })
+    }
+
+    throw new Error('Invalid audio input type')
+  }
+
+  // Throws on Node < 20 where the global `File` constructor is unavailable.
+  private ensureFileSupport(): void {
+    if (typeof File === 'undefined') {
+      throw new Error(
+        '`File` is not available in this environment. ' +
+          'Use Node.js 20 or newer, or pass a File object directly.',
+      )
+    }
+  }
+}
+
+/**
+ * Creates a Groq transcription adapter with an explicit API key.
+ * Type resolution happens here at the call site.
+ *
+ * @param model - The model name (e.g., 'whisper-large-v3-turbo')
+ * @param apiKey - Your Groq API key
+ * @param config - Optional additional configuration
+ * @returns Configured Groq transcription adapter instance
+ *
+ * @example
+ * ```typescript
+ * const adapter = createGroqTranscription('whisper-large-v3-turbo', 'gsk_...');
+ *
+ * const result = await generateTranscription({
+ *   adapter,
+ *   audio: audioFile,
+ *   language: 'en',
+ * });
+ * ```
+ */
+export function createGroqTranscription<TModel extends GroqTranscriptionModel>(
+  model: TModel,
+  apiKey: string,
+  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
+): GroqTranscriptionAdapter<TModel> {
+  // Spread `config` first so the explicit `apiKey` argument always wins, even
+  // if an untyped caller leaves an `apiKey` property on the config object.
+  return new GroqTranscriptionAdapter({ ...config, apiKey }, model)
+}
+
+/**
+ * Creates a Groq transcription adapter using the `GROQ_API_KEY` environment
+ * variable. Type resolution happens here at the call site.
+ *
+ * Looks for `GROQ_API_KEY` in:
+ * - `process.env` (Node.js)
+ * - `window.env` (browser with injected env)
+ *
+ * @param model - The model name (e.g., 'whisper-large-v3-turbo')
+ * @param config - Optional configuration (excluding apiKey which is auto-detected)
+ * @returns Configured Groq transcription adapter instance
+ * @throws Error if GROQ_API_KEY is not found in environment
+ *
+ * @example
+ * ```typescript
+ * const adapter = groqTranscription('whisper-large-v3-turbo');
+ *
+ * const result = await generateTranscription({
+ *   adapter,
+ *   audio: 'https://example.com/audio.mp3',
+ * });
+ *
+ * console.log(result.text)
+ * ```
+ */
+export function groqTranscription<TModel extends GroqTranscriptionModel>(
+  model: TModel,
+  config?: Omit<GroqTranscriptionConfig, 'apiKey'>,
+): GroqTranscriptionAdapter<TModel> {
+  const apiKey = getGroqApiKeyFromEnv()
+  return createGroqTranscription(model, apiKey, config)
+}
diff --git a/packages/ai-groq/src/audio/transcription-provider-options.ts b/packages/ai-groq/src/audio/transcription-provider-options.ts
new file mode 100644
index 000000000..ddd1dde3d
--- /dev/null
+++ b/packages/ai-groq/src/audio/transcription-provider-options.ts
@@ -0,0 +1,20 @@
+/**
+ * Groq-specific options for audio
transcription. + * + * These fields extend the shared `TranscriptionOptions` and are forwarded + * verbatim to the Groq transcription endpoint. + */ +export interface GroqTranscriptionProviderOptions { + /** + * Sampling temperature between 0 and 1. Lower values produce more + * deterministic output. Groq recommends 0 (the default) for most use cases. + */ + temperature?: number + + /** + * Granularity levels to include when `response_format` is `verbose_json`. + * Pass `['word']`, `['segment']`, or both to control which timestamp arrays + * appear in the result. + */ + timestamp_granularities?: Array<'word' | 'segment'> +} diff --git a/packages/ai-groq/src/index.ts b/packages/ai-groq/src/index.ts index 034ff38d0..b441fd880 100644 --- a/packages/ai-groq/src/index.ts +++ b/packages/ai-groq/src/index.ts @@ -14,6 +14,15 @@ export { type GroqTextProviderOptions, } from './adapters/text' +// Transcription adapter +export { + GroqTranscriptionAdapter, + createGroqTranscription, + groqTranscription, + type GroqTranscriptionConfig, +} from './adapters/transcription' +export type { GroqTranscriptionProviderOptions } from './audio/transcription-provider-options' + // Types export type { GroqChatModelProviderOptionsByName, @@ -22,8 +31,9 @@ export type { ResolveProviderOptions, ResolveInputModalities, GroqChatModels, + GroqTranscriptionModel, } from './model-meta' -export { GROQ_CHAT_MODELS } from './model-meta' +export { GROQ_CHAT_MODELS, GROQ_TRANSCRIPTION_MODELS } from './model-meta' export type { GroqTextMetadata, GroqImageMetadata, diff --git a/packages/ai-groq/src/model-meta.ts b/packages/ai-groq/src/model-meta.ts index 70eae1cde..0b711f905 100644 --- a/packages/ai-groq/src/model-meta.ts +++ b/packages/ai-groq/src/model-meta.ts @@ -402,3 +402,16 @@ export type ResolveInputModalities = TModel extends keyof GroqModelInputModalitiesByName ? GroqModelInputModalitiesByName[TModel] : readonly ['text'] + +/** + * All supported Groq transcription model identifiers. 
+ */ +export const GROQ_TRANSCRIPTION_MODELS = [ + 'whisper-large-v3-turbo', + 'whisper-large-v3', +] as const + +/** + * Union type of all supported Groq transcription model names. + */ +export type GroqTranscriptionModel = (typeof GROQ_TRANSCRIPTION_MODELS)[number] diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index 49aa74708..5854383a2 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -165,7 +165,7 @@ export const matrix: Record> = { 'audio-gen': new Set(['gemini', 'elevenlabs']), 'sound-effects': new Set(['elevenlabs']), tts: new Set(['openai', 'grok', 'elevenlabs']), - transcription: new Set(['openai', 'grok', 'elevenlabs']), + transcription: new Set(['openai', 'grok', 'groq', 'elevenlabs']), 'video-gen': new Set(['openai']), // Only Gemini currently surfaces a first-class stateful conversation API via // the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental). diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts index 2fb534291..07be9f5fe 100644 --- a/testing/e2e/src/lib/media-providers.ts +++ b/testing/e2e/src/lib/media-providers.ts @@ -10,6 +10,7 @@ import { createGrokSpeech, createGrokTranscription, } from '@tanstack/ai-grok' +import { createGroqTranscription } from '@tanstack/ai-groq' import { createElevenLabsAudio, createElevenLabsSpeech, @@ -109,6 +110,11 @@ export function createTranscriptionAdapter( baseURL: openaiUrl(aimockPort), defaultHeaders: headers, }), + groq: () => + createGroqTranscription('whisper-large-v3-turbo', DUMMY_KEY, { + baseURL: openaiUrl(aimockPort), + defaultHeaders: headers, + }), elevenlabs: () => createElevenLabsTranscription('scribe_v1', DUMMY_KEY, { baseUrl: llmockBase(aimockPort),