From a59d368bd48d204c04f7f5a132ab85c40b6e9fb3 Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Wed, 27 May 2026 16:46:26 +0200 Subject: [PATCH 1/2] add diarization support --- .../openai-transcription-diarization.md | 7 + docs/adapters/openai.md | 32 +- docs/comparison/vercel-ai-sdk.md | 2 +- docs/media/generation-hooks.md | 2 +- docs/media/transcription.md | 66 +++- .../interfaces/TranscriptionOptions.md | 2 +- packages/ai-client/src/generation-types.ts | 8 +- .../ai-openai/src/adapters/transcription.ts | 163 +++++++++- .../audio/transcription-provider-options.ts | 10 + .../tests/transcription-adapter.test.ts | 300 ++++++++++++++++++ .../skills/ai-core/media-generation/SKILL.md | 8 +- .../activities/generateTranscription/index.ts | 8 +- packages/ai/src/types.ts | 8 +- 13 files changed, 574 insertions(+), 42 deletions(-) create mode 100644 .changeset/openai-transcription-diarization.md create mode 100644 packages/ai-openai/tests/transcription-adapter.test.ts diff --git a/.changeset/openai-transcription-diarization.md b/.changeset/openai-transcription-diarization.md new file mode 100644 index 000000000..67769277c --- /dev/null +++ b/.changeset/openai-transcription-diarization.md @@ -0,0 +1,7 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-client': minor +'@tanstack/ai-openai': minor +--- + +Add OpenAI transcription diarization support with `diarized_json` output, speaker-labeled segments, diarization model validation, chunking strategy options, and docs. diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md index e780a9a0e..a59a68839 100644 --- a/docs/adapters/openai.md +++ b/docs/adapters/openai.md @@ -294,10 +294,11 @@ console.log(result.text); // Transcribed text const result = await generateTranscription({ adapter: openaiTranscription("whisper-1"), audio: audioFile, + responseFormat: "verbose_json", + prompt: "Technical terms: API, SDK", modelOptions: { - response_format: "verbose_json", // Get timestamps temperature: 0, - prompt: "Technical terms: API, SDK", + timestamp_granularities: ["word", "segment"], }, }); @@ -305,6 +306,31 @@ const result = await generateTranscription({ console.log(result.segments); ``` +### Speaker Diarization + +Use `gpt-4o-transcribe-diarize` for speaker-labeled transcripts: + +```typescript +const result = await generateTranscription({ + adapter: openaiTranscription("gpt-4o-transcribe-diarize"), + audio: meetingAudioFile, + modelOptions: { + chunking_strategy: "auto", + known_speaker_names: ["agent", "customer"], + known_speaker_references: [ + "data:audio/wav;base64,...", + "data:audio/wav;base64,...", + ], + }, +}); + +for (const segment of result.segments ?? []) { + console.log(segment.speaker, segment.start, segment.end, segment.text); +} +``` + +`gpt-4o-transcribe-diarize` defaults to `responseFormat: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription. + ## Environment Variables Set your API key in environment variables: @@ -353,7 +379,7 @@ Creates an OpenAI text-to-speech adapter. ### `openaiTranscription(model, config?)` / `createOpenaiTranscription(model, apiKey, config?)` -Creates an OpenAI transcription adapter (Whisper). +Creates an OpenAI transcription adapter for Whisper, GPT-4o transcription, and GPT-4o diarized transcription models. ### `openaiVideo(model, config?)` / `createOpenaiVideo(model, apiKey, config?)` diff --git a/docs/comparison/vercel-ai-sdk.md b/docs/comparison/vercel-ai-sdk.md index 415a4731c..eba344284 100644 --- a/docs/comparison/vercel-ai-sdk.md +++ b/docs/comparison/vercel-ai-sdk.md @@ -389,7 +389,7 @@ const result = await generateSpeech({ }) ``` -**Transcription** - `generateTranscription()` supports 5 output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. +**Transcription** - `generateTranscription()` supports 6 output formats (json, text, srt, verbose_json, vtt, diarized_json), word-level timestamps with confidence scores, and speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model. ```ts import { generateTranscription } from '@tanstack/ai' diff --git a/docs/media/generation-hooks.md b/docs/media/generation-hooks.md index d9adadc6a..bc2ac69a0 100644 --- a/docs/media/generation-hooks.md +++ b/docs/media/generation-hooks.md @@ -214,7 +214,7 @@ The `generate` function accepts a `TranscriptionGenerateInput`: | `audio` | `string \| File \| Blob` | Audio data -- base64 string, File, or Blob (required) | | `language` | `string` | Language in ISO-639-1 format (e.g., `"en"`) | | `prompt` | `string` | Optional prompt to guide the transcription | -| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Output format | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | | `modelOptions` | `Record` | Model-specific options | ## useSummarize diff --git a/docs/media/transcription.md b/docs/media/transcription.md index eaf64dfad..f15336c40 100644 --- a/docs/media/transcription.md +++ b/docs/media/transcription.md @@ -2,7 +2,7 @@ title: Transcription id: transcription order: 4 -description: "Transcribe audio to text with OpenAI Whisper and GPT-4o-transcribe via TanStack AI's generateTranscription() API." +description: "Transcribe audio to text with OpenAI Whisper and GPT-4o transcription models, including speaker diarization, via TanStack AI's generateTranscription() API." keywords: - tanstack ai - transcription @@ -22,7 +22,7 @@ TanStack AI provides support for audio transcription (speech-to-text) through de Audio transcription is handled by transcription adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. Currently supported: -- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe +- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe, GPT-4o-transcribe-diarize - **fal.ai**: Whisper, Wizper, speech-to-text turbo, ElevenLabs speech-to-text ## Basic Usage @@ -107,6 +107,8 @@ for (const segment of result.segments ?? []) { |--------|------|-------------| | `audio` | `File \| string` | Audio data (File object or base64 string) - required | | `language` | `string` | Language code (e.g., "en", "es", "fr") | +| `prompt` | `string` | Optional prompt to guide transcription style or terms. Not supported with `gpt-4o-transcribe-diarize`. | +| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format | ### Supported Languages @@ -135,20 +137,23 @@ Whisper supports many languages. Common codes include: const result = await generateTranscription({ adapter: openaiTranscription('whisper-1'), audio: audioFile, + responseFormat: 'verbose_json', + prompt: 'Technical terms: API, SDK, CLI', modelOptions: { - response_format: 'verbose_json', // Get detailed output with timestamps temperature: 0, // Lower = more deterministic - prompt: 'Technical terms: API, SDK, CLI', // Guide transcription + timestamp_granularities: ['word', 'segment'], }, }) ``` | Option | Type | Description | |--------|------|-------------| -| `response_format` | `string` | Output format: "json", "text", "srt", "verbose_json", "vtt" | | `temperature` | `number` | Sampling temperature (0 to 1) | -| `prompt` | `string` | Optional text to guide transcription style | -| `include` | `string[]` | Timestamp granularity: ["word"], ["segment"], or both | +| `include` | `string[]` | Additional response data such as `logprobs`; only available with `json` responses on supported GPT-4o transcription models | +| `timestamp_granularities` | `('word' \| 'segment')[]` | Timestamp detail for `whisper-1` with `responseFormat: 'verbose_json'` | +| `chunking_strategy` | `'auto' \| { type: 'server_vad', ... } \| null` | Audio chunking strategy for `gpt-4o-transcribe-diarize`; required by OpenAI for diarization inputs longer than 30 seconds | +| `known_speaker_names` | `string[]` | Up to four speaker labels for diarization | +| `known_speaker_references` | `string[]` | 2-10 second data URL audio samples matching `known_speaker_names` | ### Response Formats @@ -159,6 +164,32 @@ const result = await generateTranscription({ | `srt` | SubRip subtitle format | | `verbose_json` | Detailed JSON with timestamps and segments | | `vtt` | WebVTT subtitle format | +| `diarized_json` | JSON with speaker-labeled segments. Only supported by `gpt-4o-transcribe-diarize`. | + +### Speaker Diarization + +Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `responseFormat: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself. + +```typescript +const result = await generateTranscription({ + adapter: openaiTranscription('gpt-4o-transcribe-diarize'), + audio: meetingAudioFile, + modelOptions: { + chunking_strategy: 'auto', + known_speaker_names: ['agent', 'customer'], + known_speaker_references: [ + 'data:audio/wav;base64,...', + 'data:audio/wav;base64,...', + ], + }, +}) + +for (const segment of result.segments ?? []) { + console.log(segment.speaker, segment.start, segment.end, segment.text) +} +``` + +OpenAI accepts up to four known speaker references. The diarization model does not support `prompt`, `include`, or `timestamp_granularities`; the adapter rejects those combinations before making the API request. ## Response Format @@ -172,15 +203,17 @@ interface TranscriptionResult { language?: string // Detected/specified language duration?: number // Audio duration in seconds segments?: Array<{ // Timestamped segments + id: number start: number // Start time in seconds end: number // End time in seconds text: string // Segment text - words?: Array<{ // Word-level timestamps - word: string - start: number - end: number - confidence?: number - }> + confidence?: number + speaker?: string // Present for diarized output + }> + words?: Array<{ // Word-level timestamps + word: string + start: number + end: number }> } ``` @@ -208,9 +241,9 @@ async function transcribeAudio(filepath: string) { adapter: openaiTranscription('whisper-1'), audio: audioFile, language: 'en', + responseFormat: 'verbose_json', modelOptions: { - response_format: 'verbose_json', - include: ['segment', 'word'], + timestamp_granularities: ['word', 'segment'], }, }) @@ -540,5 +573,6 @@ const adapter = createOpenaiTranscription('your-openai-api-key') 5. **Prompting**: Use the `prompt` option to provide context or expected vocabulary (e.g., technical terms, names). -6. **Timestamps**: Request `verbose_json` format and enable `include: ['word', 'segment']` when you need timing information for captions or synchronization. +6. **Timestamps**: Request `responseFormat: 'verbose_json'` and set `modelOptions.timestamp_granularities` when you need timing information for captions or synchronization. +7. **Diarization**: Use `gpt-4o-transcribe-diarize` with `diarized_json` output for multi-speaker audio. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning. diff --git a/docs/reference/interfaces/TranscriptionOptions.md b/docs/reference/interfaces/TranscriptionOptions.md index 4a6192733..311cae231 100644 --- a/docs/reference/interfaces/TranscriptionOptions.md +++ b/docs/reference/interfaces/TranscriptionOptions.md @@ -95,7 +95,7 @@ An optional prompt to guide the transcription ### responseFormat? ```ts -optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt"; +optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt" | "diarized_json"; ``` Defined in: [packages/ai/src/types.ts:1693](https://github.com/TanStack/ai/blob/main/packages/ai/src/types.ts#L1693) diff --git a/packages/ai-client/src/generation-types.ts b/packages/ai-client/src/generation-types.ts index 347be9d1b..2a2974d3e 100644 --- a/packages/ai-client/src/generation-types.ts +++ b/packages/ai-client/src/generation-types.ts @@ -265,7 +265,13 @@ export interface TranscriptionGenerateInput { /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Model-specific options */ modelOptions?: Record } diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index 060dbf0e8..44cfc06fb 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -13,6 +13,24 @@ import type { OpenAITranscriptionModel } from '../model-meta' import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-provider-options' import type { OpenAIClientConfig } from '../utils/client' +const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const + +type DiarizeModel = (typeof DIARIZE_MODELS)[number] + +function isDiarizeModel(model: string): model is DiarizeModel { + return DIARIZE_MODELS.includes(model as DiarizeModel) +} + +function mapDiarizedSegmentId(id: string, index: number): number { + const match = /^seg_(\d+)$/.exec(id) + if (match) return Number(match[1]) + + const numericId = Number(id) + if (!Number.isNaN(numericId)) return numericId + + return index +} + /** * Configuration for OpenAI Transcription adapter */ @@ -22,12 +40,12 @@ export interface OpenAITranscriptionConfig extends OpenAIClientConfig {} * OpenAI Transcription (Speech-to-Text) Adapter * * Tree-shakeable adapter for OpenAI audio transcription functionality. - * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize models. + * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize. * * Features: * - Multiple transcription models with different capabilities * - Language detection or specification - * - Multiple output formats: json, text, srt, verbose_json, vtt + * - Multiple output formats: json, text, srt, verbose_json, vtt, diarized_json * - Word and segment-level timestamps (with verbose_json — whisper-1 only; * gpt-4o-* transcribe models accept only json/text and reject verbose_json * with HTTP 400) @@ -52,12 +70,24 @@ export class OpenAITranscriptionAdapter< options const file = this.prepareAudioFile(audio) + const isDiarizeTranscriptionModel = isDiarizeModel(model) + const useDiarized = + responseFormat === 'diarized_json' || + (isDiarizeTranscriptionModel && responseFormat === undefined) + this.validateDiarizationOptions({ + model, + prompt, + responseFormat, + modelOptions, + }) // With exactOptionalPropertyTypes, vendor SDK request shapes reject // `T | undefined` in optional fields. Build the request incrementally and // only set optional fields when they're actually defined. - const responseFormatValue = this.mapResponseFormat(responseFormat) - const request: OpenAI_SDK.Audio.TranscriptionCreateParams = { + const responseFormatValue = useDiarized + ? 'diarized_json' + : this.mapResponseFormat(responseFormat) + const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = { model, file, ...(modelOptions ?? {}), @@ -68,6 +98,12 @@ export class OpenAITranscriptionAdapter< if (prompt !== undefined) { request.prompt = prompt } + if ( + isDiarizeTranscriptionModel && + modelOptions?.chunking_strategy === undefined + ) { + request.chunking_strategy = 'auto' + } if (responseFormatValue !== undefined) { request.response_format = responseFormatValue } @@ -75,14 +111,38 @@ export class OpenAITranscriptionAdapter< // Only Whisper supports verbose_json. The gpt-4o-* transcribe models // accept only json/text and reject verbose_json with HTTP 400. const useVerbose = - responseFormat === 'verbose_json' || + (!useDiarized && responseFormat === 'verbose_json') || (!responseFormat && model === 'whisper-1') try { options.logger.request( - `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`, + `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose} diarized=${useDiarized}`, { provider: this.name, model }, ) + if (useDiarized) { + const response = (await this.client.audio.transcriptions.create( + request, + )) as OpenAI_SDK.Audio.TranscriptionDiarized + + const segments = response.segments.map( + (segment, index): TranscriptionSegment => ({ + id: mapDiarizedSegmentId(segment.id, index), + start: segment.start, + end: segment.end, + text: segment.text, + speaker: segment.speaker, + }), + ) + + return { + id: generateId(this.name), + model, + text: response.text, + duration: response.duration, + ...(segments.length > 0 && { segments }), + } + } + if (useVerbose) { const response = (await this.client.audio.transcriptions.create({ ...request, @@ -118,15 +178,15 @@ export class OpenAITranscriptionAdapter< ...(segments !== undefined && { segments }), ...(words !== undefined && { words }), } - } else { - const response = await this.client.audio.transcriptions.create(request) + } - return { - id: generateId(this.name), - model, - text: typeof response === 'string' ? response : response.text, - ...(language !== undefined && { language }), - } + const response = await this.client.audio.transcriptions.create(request) + + return { + id: generateId(this.name), + model, + text: typeof response === 'string' ? response : response.text, + ...(language !== undefined && { language }), } } catch (error: unknown) { options.logger.errors(`${this.name}.transcribe fatal`, { @@ -182,8 +242,81 @@ export class OpenAITranscriptionAdapter< } } + private validateDiarizationOptions({ + model, + prompt, + responseFormat, + modelOptions, + }: Pick< + TranscriptionOptions, + 'model' | 'prompt' | 'responseFormat' | 'modelOptions' + >): void { + const isDiarizeTranscriptionModel = isDiarizeModel(model) + + if ( + !isDiarizeTranscriptionModel && + (responseFormat === 'diarized_json' || + modelOptions?.known_speaker_names !== undefined || + modelOptions?.known_speaker_references !== undefined) + ) { + throw new Error( + 'OpenAI speaker diarization options are only supported with OpenAI diarization transcription models.', + ) + } + + if (!isDiarizeTranscriptionModel) return + + if (prompt !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support prompts.', + ) + } + + if (modelOptions?.include !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support the include option.', + ) + } + + if (modelOptions?.timestamp_granularities !== undefined) { + throw new Error( + 'OpenAI diarization transcription models do not support timestamp_granularities.', + ) + } + + if (modelOptions?.known_speaker_names !== undefined) { + const knownSpeakerCount = modelOptions.known_speaker_names.length + if (knownSpeakerCount > 4) { + throw new Error( + 'OpenAI diarization transcription models support at most 4 known speaker names.', + ) + } + } + + if (modelOptions?.known_speaker_references !== undefined) { + const knownSpeakerReferenceCount = + modelOptions.known_speaker_references.length + if (knownSpeakerReferenceCount > 4) { + throw new Error( + 'OpenAI diarization transcription models support at most 4 known speaker references.', + ) + } + } + + if ( + modelOptions?.known_speaker_names !== undefined && + modelOptions.known_speaker_references !== undefined && + modelOptions.known_speaker_names.length !== + modelOptions.known_speaker_references.length + ) { + throw new Error( + 'OpenAI diarization known_speaker_names and known_speaker_references must have matching lengths.', + ) + } + } + protected mapResponseFormat( - format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt', + format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json', ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] { if (!format) return 'json' return format diff --git a/packages/ai-openai/src/audio/transcription-provider-options.ts b/packages/ai-openai/src/audio/transcription-provider-options.ts index 17f619cb0..befa5df6a 100644 --- a/packages/ai-openai/src/audio/transcription-provider-options.ts +++ b/packages/ai-openai/src/audio/transcription-provider-options.ts @@ -38,4 +38,14 @@ export interface OpenAITranscriptionProviderOptions { * Optional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file. */ known_speaker_references?: Array + /** + * Controls how the audio is cut into chunks. Required by OpenAI when + * `gpt-4o-transcribe-diarize` input is longer than 30 seconds. Use `"auto"` + * for the service-managed VAD strategy, or pass a `server_vad` config to tune + * segmentation. + */ + chunking_strategy?: + | 'auto' + | OpenAI.Audio.TranscriptionCreateParams.VadConfig + | null } diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts new file mode 100644 index 000000000..fc68d3b86 --- /dev/null +++ b/packages/ai-openai/tests/transcription-adapter.test.ts @@ -0,0 +1,300 @@ +import { describe, expect, it, vi } from 'vitest' +import { resolveDebugOption } from '@tanstack/ai/adapter-internals' +import { + OpenAITranscriptionAdapter, + createOpenaiTranscription, +} from '../src/adapters/transcription' +import type OpenAI from 'openai' +import type { OpenAITranscriptionModel } from '../src/model-meta' + +const testLogger = resolveDebugOption(false) + +class TestOpenAITranscriptionAdapter< + TModel extends OpenAITranscriptionModel, +> extends OpenAITranscriptionAdapter { + spyOnTranscriptionsCreate() { + return vi.spyOn(this.client.audio.transcriptions, 'create') + } +} + +describe('OpenAI transcription adapter', () => { + it('creates a diarization-capable adapter', () => { + const adapter = createOpenaiTranscription( + 'gpt-4o-transcribe-diarize', + 'test-api-key', + ) + + expect(adapter).toBeInstanceOf(OpenAITranscriptionAdapter) + expect(adapter.name).toBe('openai') + }) + + it('defaults the diarization model to diarized_json with automatic chunking', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Agent: Hello\nCustomer: Hi', + duration: 2.2, + task: 'transcribe', + segments: [ + { + id: 'seg_0', + type: 'transcript.text.segment', + start: 0, + end: 1.4, + text: 'Hello', + speaker: 'agent', + }, + { + id: 'seg_1', + type: 'transcript.text.segment', + start: 1.5, + end: 2.2, + text: 'Hi', + speaker: 'customer', + }, + ], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'meeting.wav', { type: 'audio/wav' }), + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'gpt-4o-transcribe-diarize', + response_format: 'diarized_json', + chunking_strategy: 'auto', + }), + ) + expect(result.text).toBe('Agent: Hello\nCustomer: Hi') + expect(result.segments).toEqual([ + { + id: 0, + start: 0, + end: 1.4, + text: 'Hello', + speaker: 'agent', + }, + { + id: 1, + start: 1.5, + end: 2.2, + text: 'Hi', + speaker: 'customer', + }, + ]) + }) + + it('passes explicit diarization chunking and known speaker references', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Speaker text', + duration: 1, + task: 'transcribe', + segments: [ + { + id: 'speaker-intro', + type: 'transcript.text.segment', + start: 0, + end: 1, + text: 'Speaker text', + speaker: 'agent', + }, + ], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'meeting.wav', { type: 'audio/wav' }), + responseFormat: 'diarized_json', + modelOptions: { + chunking_strategy: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + known_speaker_names: ['agent'], + known_speaker_references: ['data:audio/wav;base64,AAA='], + }, + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: 'diarized_json', + chunking_strategy: { + type: 'server_vad', + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 500, + }, + known_speaker_names: ['agent'], + known_speaker_references: ['data:audio/wav;base64,AAA='], + }), + ) + expect(result.segments?.[0]?.id).toBe(0) + }) + + it('respects explicit null chunking for short diarization inputs', async () => { + const mockResponse: OpenAI.Audio.TranscriptionDiarized = { + text: 'Hello', + duration: 1, + task: 'transcribe', + segments: [], + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'short.wav', { type: 'audio/wav' }), + modelOptions: { + chunking_strategy: null, + }, + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + chunking_strategy: null, + }), + ) + }) + + it('allows json or text response formats for the diarization model', async () => { + const mockResponse: OpenAI.Audio.Transcription = { + text: 'Hello', + } + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + const mockCreate = adapter + .spyOnTranscriptionsCreate() + .mockResolvedValueOnce(mockResponse) + + const result = await adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'short.wav', { type: 'audio/wav' }), + responseFormat: 'json', + logger: testLogger, + }) + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + response_format: 'json', + chunking_strategy: 'auto', + }), + ) + expect(result).toMatchObject({ + model: 'gpt-4o-transcribe-diarize', + text: 'Hello', + }) + }) + + it('rejects diarized_json with non-diarization models', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'whisper-1', + ) + + await expect( + adapter.transcribe({ + model: 'whisper-1', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + responseFormat: 'diarized_json', + logger: testLogger, + }), + ).rejects.toThrow('speaker diarization options') + }) + + it('rejects unsupported diarization prompt and timestamp options', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + prompt: 'Use product vocabulary', + logger: testLogger, + }), + ).rejects.toThrow('do not support prompts') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + timestamp_granularities: ['word'], + }, + logger: testLogger, + }), + ).rejects.toThrow('timestamp_granularities') + }) + + it('rejects unsupported diarization include and too many known speakers', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + include: ['logprobs'], + }, + logger: testLogger, + }), + ).rejects.toThrow('include') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + known_speaker_names: ['a', 'b', 'c', 'd', 'e'], + }, + logger: testLogger, + }), + ).rejects.toThrow('at most 4') + + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + modelOptions: { + known_speaker_names: ['agent'], + known_speaker_references: [ + 'data:audio/wav;base64,AAA=', + 'data:audio/wav;base64,BBB=', + ], + }, + logger: testLogger, + }), + ).rejects.toThrow('matching lengths') + }) +}) diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 09a552b73..239e29308 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -259,7 +259,7 @@ const { generate, result, isLoading } = useGenerateSpeech({ ### 4. Audio Transcription Adapter: `openaiTranscription` (whisper-1, gpt-4o-transcribe, -gpt-4o-mini-transcribe). +gpt-4o-mini-transcribe, gpt-4o-transcribe-diarize). ```typescript import { generateTranscription } from '@tanstack/ai' @@ -271,7 +271,7 @@ const result = await generateTranscription({ language: 'en', responseFormat: 'verbose_json', modelOptions: { - include: ['segment', 'word'], + timestamp_granularities: ['word', 'segment'], }, }) @@ -281,6 +281,10 @@ const result = await generateTranscription({ // result.segments -- timestamped segments with optional word-level timestamps ``` +For speaker diarization, use `openaiTranscription('gpt-4o-transcribe-diarize')`. +It defaults to `responseFormat: 'diarized_json'` and `chunking_strategy: 'auto'`; +do not pass `prompt`, `include`, or `timestamp_granularities` with this model. + Client hook: ```tsx diff --git a/packages/ai/src/activities/generateTranscription/index.ts b/packages/ai/src/activities/generateTranscription/index.ts index 90262e9e9..9705d57b0 100644 --- a/packages/ai/src/activities/generateTranscription/index.ts +++ b/packages/ai/src/activities/generateTranscription/index.ts @@ -59,7 +59,13 @@ export interface TranscriptionActivityOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Provider-specific options for transcription */ modelOptions?: TranscriptionProviderOptions /** diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index 6c596a2dc..01c768a9d 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -1690,7 +1690,13 @@ export interface TranscriptionOptions< /** An optional prompt to guide the transcription */ prompt?: string /** The format of the transcription output */ - responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + responseFormat?: + | 'json' + | 'text' + | 'srt' + | 'verbose_json' + | 'vtt' + | 'diarized_json' /** Model-specific options for transcription */ modelOptions?: TProviderOptions /** From 05dfb5360e845db5abe6d3bbe926aefd78c6943c Mon Sep 17 00:00:00 2001 From: 8times4 <46720448+8times4@users.noreply.github.com> Date: Thu, 28 May 2026 13:08:58 +0200 Subject: [PATCH 2/2] fix coderabbit recommendations --- .../ai-openai/src/adapters/transcription.ts | 35 ++++++++++++++++--- .../tests/transcription-adapter.test.ts | 20 +++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts index 44cfc06fb..892f7a414 100644 --- a/packages/ai-openai/src/adapters/transcription.ts +++ b/packages/ai-openai/src/adapters/transcription.ts @@ -14,8 +14,12 @@ import type { OpenAITranscriptionProviderOptions } from '../audio/transcription- import type { OpenAIClientConfig } from '../utils/client' const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const +const DIARIZE_RESPONSE_FORMATS = ['json', 'text', 'diarized_json'] as const type DiarizeModel = (typeof DIARIZE_MODELS)[number] +type OpenAITranscriptionResponseFormat = NonNullable< + TranscriptionOptions['responseFormat'] +> function isDiarizeModel(model: string): model is DiarizeModel { return DIARIZE_MODELS.includes(model as DiarizeModel) @@ -104,9 +108,7 @@ export class OpenAITranscriptionAdapter< ) { request.chunking_strategy = 'auto' } - if (responseFormatValue !== undefined) { - request.response_format = responseFormatValue - } + request.response_format = responseFormatValue // Only Whisper supports verbose_json. The gpt-4o-* transcribe models // accept only json/text and reject verbose_json with HTTP 400. @@ -266,6 +268,29 @@ export class OpenAITranscriptionAdapter< if (!isDiarizeTranscriptionModel) return + const modelOptionsResponseFormat = ( + modelOptions as + | { responseFormat?: OpenAITranscriptionResponseFormat } + | undefined + )?.responseFormat + const requestedResponseFormats = [ + this.mapResponseFormat(responseFormat), + ...(modelOptionsResponseFormat !== undefined + ? [this.mapResponseFormat(modelOptionsResponseFormat)] + : []), + ] + const unsupportedResponseFormat = requestedResponseFormats.find( + (format) => + !DIARIZE_RESPONSE_FORMATS.includes( + format as (typeof DIARIZE_RESPONSE_FORMATS)[number], + ), + ) + if (unsupportedResponseFormat !== undefined) { + throw new Error( + 'OpenAI diarization transcription models only support json, text, and diarized_json response formats.', + ) + } + if (prompt !== undefined) { throw new Error( 'OpenAI diarization transcription models do not support prompts.', @@ -316,8 +341,8 @@ export class OpenAITranscriptionAdapter< } protected mapResponseFormat( - format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json', - ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] { + format?: OpenAITranscriptionResponseFormat, + ): OpenAITranscriptionResponseFormat { if (!format) return 'json' return format } diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts index fc68d3b86..993bcb135 100644 --- a/packages/ai-openai/tests/transcription-adapter.test.ts +++ b/packages/ai-openai/tests/transcription-adapter.test.ts @@ -211,6 +211,26 @@ describe('OpenAI transcription adapter', () => { }) }) + it('rejects unsupported response formats for the diarization model', async () => { + const adapter = new TestOpenAITranscriptionAdapter( + { apiKey: 'test-api-key' }, + 'gpt-4o-transcribe-diarize', + ) + + for (const responseFormat of ['srt', 'vtt', 'verbose_json'] as const) { + await expect( + adapter.transcribe({ + model: 'gpt-4o-transcribe-diarize', + audio: new File([], 'audio.wav', { type: 'audio/wav' }), + responseFormat, + logger: testLogger, + }), + ).rejects.toThrow( + 'diarization transcription models only support json, text, and diarized_json', + ) + } + }) + it('rejects diarized_json with non-diarization models', async () => { const adapter = new TestOpenAITranscriptionAdapter( { apiKey: 'test-api-key' },