From a59d368bd48d204c04f7f5a132ab85c40b6e9fb3 Mon Sep 17 00:00:00 2001
From: 8times4 <46720448+8times4@users.noreply.github.com>
Date: Wed, 27 May 2026 16:46:26 +0200
Subject: [PATCH 1/2] add diarization support

---
 .../openai-transcription-diarization.md       |   7 +
 docs/adapters/openai.md                       |  32 +-
 docs/comparison/vercel-ai-sdk.md              |   2 +-
 docs/media/generation-hooks.md                |   2 +-
 docs/media/transcription.md                   |  66 +++-
 .../interfaces/TranscriptionOptions.md        |   2 +-
 packages/ai-client/src/generation-types.ts    |   8 +-
 .../ai-openai/src/adapters/transcription.ts   | 163 +++++++++-
 .../audio/transcription-provider-options.ts   |  10 +
 .../tests/transcription-adapter.test.ts       | 300 ++++++++++++++++++
 .../skills/ai-core/media-generation/SKILL.md  |   8 +-
 .../activities/generateTranscription/index.ts |   8 +-
 packages/ai/src/types.ts                      |   8 +-
 13 files changed, 574 insertions(+), 42 deletions(-)
 create mode 100644 .changeset/openai-transcription-diarization.md
 create mode 100644 packages/ai-openai/tests/transcription-adapter.test.ts

diff --git a/.changeset/openai-transcription-diarization.md b/.changeset/openai-transcription-diarization.md
new file mode 100644
index 000000000..67769277c
--- /dev/null
+++ b/.changeset/openai-transcription-diarization.md
@@ -0,0 +1,7 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-client': minor
+'@tanstack/ai-openai': minor
+---
+
+Add OpenAI transcription diarization support with `diarized_json` output, speaker-labeled segments, diarization model validation, chunking strategy options, and docs.
diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md
index e780a9a0e..a59a68839 100644
--- a/docs/adapters/openai.md
+++ b/docs/adapters/openai.md
@@ -294,10 +294,11 @@ console.log(result.text); // Transcribed text
 const result = await generateTranscription({
   adapter: openaiTranscription("whisper-1"),
   audio: audioFile,
+  responseFormat: "verbose_json",
+  prompt: "Technical terms: API, SDK",
   modelOptions: {
-    response_format: "verbose_json", // Get timestamps
     temperature: 0,
-    prompt: "Technical terms: API, SDK",
+    timestamp_granularities: ["word", "segment"],
   },
 });
 
@@ -305,6 +306,31 @@ const result = await generateTranscription({
 console.log(result.segments);
 ```
 
+### Speaker Diarization
+
+Use `gpt-4o-transcribe-diarize` for speaker-labeled transcripts:
+
+```typescript
+const result = await generateTranscription({
+  adapter: openaiTranscription("gpt-4o-transcribe-diarize"),
+  audio: meetingAudioFile,
+  modelOptions: {
+    chunking_strategy: "auto",
+    known_speaker_names: ["agent", "customer"],
+    known_speaker_references: [
+      "data:audio/wav;base64,...",
+      "data:audio/wav;base64,...",
+    ],
+  },
+});
+
+for (const segment of result.segments ?? []) {
+  console.log(segment.speaker, segment.start, segment.end, segment.text);
+}
+```
+
+`gpt-4o-transcribe-diarize` defaults to `responseFormat: "diarized_json"` and `chunking_strategy: "auto"`. OpenAI does not support `prompt`, `include`, or `timestamp_granularities` with diarized transcription.
+
 ## Environment Variables
 
 Set your API key in environment variables:
@@ -353,7 +379,7 @@ Creates an OpenAI text-to-speech adapter.
 
 ### `openaiTranscription(model, config?)` / `createOpenaiTranscription(model, apiKey, config?)`
 
-Creates an OpenAI transcription adapter (Whisper).
+Creates an OpenAI transcription adapter for Whisper, GPT-4o transcription, and GPT-4o diarized transcription models.
 
 ### `openaiVideo(model, config?)` / `createOpenaiVideo(model, apiKey, config?)`
 
diff --git a/docs/comparison/vercel-ai-sdk.md b/docs/comparison/vercel-ai-sdk.md
index 415a4731c..eba344284 100644
--- a/docs/comparison/vercel-ai-sdk.md
+++ b/docs/comparison/vercel-ai-sdk.md
@@ -389,7 +389,7 @@ const result = await generateSpeech({
 })
 ```
 
-**Transcription** - `generateTranscription()` supports 5 output formats (json, text, srt, verbose_json, vtt), word-level timestamps with confidence scores, and speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
+**Transcription** - `generateTranscription()` supports 6 output formats (json, text, srt, verbose_json, vtt, diarized_json), word-level timestamps with confidence scores, and speaker diarization via OpenAI's `gpt-4o-transcribe-diarize` model.
 
 ```ts
 import { generateTranscription } from '@tanstack/ai'
diff --git a/docs/media/generation-hooks.md b/docs/media/generation-hooks.md
index d9adadc6a..bc2ac69a0 100644
--- a/docs/media/generation-hooks.md
+++ b/docs/media/generation-hooks.md
@@ -214,7 +214,7 @@ The `generate` function accepts a `TranscriptionGenerateInput`:
 | `audio` | `string \| File \| Blob` | Audio data -- base64 string, File, or Blob (required) |
 | `language` | `string` | Language in ISO-639-1 format (e.g., `"en"`) |
 | `prompt` | `string` | Optional prompt to guide the transcription |
-| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt'` | Output format |
+| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format |
 | `modelOptions` | `Record<string, any>` | Model-specific options |
 
 ## useSummarize
diff --git a/docs/media/transcription.md b/docs/media/transcription.md
index eaf64dfad..f15336c40 100644
--- a/docs/media/transcription.md
+++ b/docs/media/transcription.md
@@ -2,7 +2,7 @@
 title: Transcription
 id: transcription
 order: 4
-description: "Transcribe audio to text with OpenAI Whisper and GPT-4o-transcribe via TanStack AI's generateTranscription() API."
+description: "Transcribe audio to text with OpenAI Whisper and GPT-4o transcription models, including speaker diarization, via TanStack AI's generateTranscription() API."
 keywords:
   - tanstack ai
   - transcription
@@ -22,7 +22,7 @@ TanStack AI provides support for audio transcription (speech-to-text) through de
 Audio transcription is handled by transcription adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI.
 
 Currently supported:
-- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe
+- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe, GPT-4o-transcribe-diarize
 - **fal.ai**: Whisper, Wizper, speech-to-text turbo, ElevenLabs speech-to-text
 
 ## Basic Usage
@@ -107,6 +107,8 @@ for (const segment of result.segments ?? []) {
 |--------|------|-------------|
 | `audio` | `File \| string` | Audio data (File object or base64 string) - required |
 | `language` | `string` | Language code (e.g., "en", "es", "fr") |
+| `prompt` | `string` | Optional prompt to guide transcription style or terms. Not supported with `gpt-4o-transcribe-diarize`. |
+| `responseFormat` | `'json' \| 'text' \| 'srt' \| 'verbose_json' \| 'vtt' \| 'diarized_json'` | Output format |
 
 ### Supported Languages
 
@@ -135,20 +137,23 @@ Whisper supports many languages. Common codes include:
 const result = await generateTranscription({
   adapter: openaiTranscription('whisper-1'),
   audio: audioFile,
+  responseFormat: 'verbose_json',
+  prompt: 'Technical terms: API, SDK, CLI',
   modelOptions: {
-    response_format: 'verbose_json', // Get detailed output with timestamps
     temperature: 0, // Lower = more deterministic
-    prompt: 'Technical terms: API, SDK, CLI', // Guide transcription
+    timestamp_granularities: ['word', 'segment'],
   },
 })
 ```
 
 | Option | Type | Description |
 |--------|------|-------------|
-| `response_format` | `string` | Output format: "json", "text", "srt", "verbose_json", "vtt" |
 | `temperature` | `number` | Sampling temperature (0 to 1) |
-| `prompt` | `string` | Optional text to guide transcription style |
-| `include` | `string[]` | Timestamp granularity: ["word"], ["segment"], or both |
+| `include` | `string[]` | Additional response data such as `logprobs`; only available with `json` responses on supported GPT-4o transcription models |
+| `timestamp_granularities` | `('word' \| 'segment')[]` | Timestamp detail for `whisper-1` with `responseFormat: 'verbose_json'` |
+| `chunking_strategy` | `'auto' \| { type: 'server_vad', ... } \| null` | Audio chunking strategy for `gpt-4o-transcribe-diarize`; required by OpenAI for diarization inputs longer than 30 seconds |
+| `known_speaker_names` | `string[]` | Up to four speaker labels for diarization |
+| `known_speaker_references` | `string[]` | 2-10 second data URL audio samples matching `known_speaker_names` |
 
 ### Response Formats
 
@@ -159,6 +164,32 @@ const result = await generateTranscription({
 | `srt` | SubRip subtitle format |
 | `verbose_json` | Detailed JSON with timestamps and segments |
 | `vtt` | WebVTT subtitle format |
+| `diarized_json` | JSON with speaker-labeled segments. Only supported by `gpt-4o-transcribe-diarize`. |
+
+### Speaker Diarization
+
+Use `gpt-4o-transcribe-diarize` when you need speaker labels. TanStack AI defaults this model to `responseFormat: 'diarized_json'` and sends `chunking_strategy: 'auto'` unless you provide a chunking strategy yourself.
+
+```typescript
+const result = await generateTranscription({
+  adapter: openaiTranscription('gpt-4o-transcribe-diarize'),
+  audio: meetingAudioFile,
+  modelOptions: {
+    chunking_strategy: 'auto',
+    known_speaker_names: ['agent', 'customer'],
+    known_speaker_references: [
+      'data:audio/wav;base64,...',
+      'data:audio/wav;base64,...',
+    ],
+  },
+})
+
+for (const segment of result.segments ?? []) {
+  console.log(segment.speaker, segment.start, segment.end, segment.text)
+}
+```
+
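+OpenAI accepts up to four known speakers, and `known_speaker_names` and `known_speaker_references` must be the same length when both are provided. The diarization model does not support `prompt`, `include`, or `timestamp_granularities`; the adapter rejects all of these invalid combinations before making the API request.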
 
 ## Response Format
 
@@ -172,15 +203,17 @@ interface TranscriptionResult {
   language?: string    // Detected/specified language
   duration?: number    // Audio duration in seconds
   segments?: Array<{   // Timestamped segments
+    id: number
     start: number      // Start time in seconds
     end: number        // End time in seconds
     text: string       // Segment text
-    words?: Array<{    // Word-level timestamps
-      word: string
-      start: number
-      end: number
-      confidence?: number
-    }>
+    confidence?: number
+    speaker?: string   // Present for diarized output
+  }>
+  words?: Array<{      // Word-level timestamps
+    word: string
+    start: number
+    end: number
   }>
 }
 ```
@@ -208,9 +241,9 @@ async function transcribeAudio(filepath: string) {
     adapter: openaiTranscription('whisper-1'),
     audio: audioFile,
     language: 'en',
+    responseFormat: 'verbose_json',
     modelOptions: {
-      response_format: 'verbose_json',
-      include: ['segment', 'word'],
+      timestamp_granularities: ['word', 'segment'],
     },
   })
 
@@ -540,5 +573,6 @@ const adapter = createOpenaiTranscription('your-openai-api-key')
 
 5. **Prompting**: Use the `prompt` option to provide context or expected vocabulary (e.g., technical terms, names).
 
-6. **Timestamps**: Request `verbose_json` format and enable `include: ['word', 'segment']` when you need timing information for captions or synchronization.
+6. **Timestamps**: Request `responseFormat: 'verbose_json'` and set `modelOptions.timestamp_granularities` when you need timing information for captions or synchronization.
 
+7. **Diarization**: Use `gpt-4o-transcribe-diarize` with `diarized_json` output for multi-speaker audio. Keep `chunking_strategy: 'auto'` unless you need custom VAD tuning.
diff --git a/docs/reference/interfaces/TranscriptionOptions.md b/docs/reference/interfaces/TranscriptionOptions.md
index 4a6192733..311cae231 100644
--- a/docs/reference/interfaces/TranscriptionOptions.md
+++ b/docs/reference/interfaces/TranscriptionOptions.md
@@ -95,7 +95,7 @@ An optional prompt to guide the transcription
 ### responseFormat?
 
 ```ts
-optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt";
+optional responseFormat: "text" | "json" | "srt" | "verbose_json" | "vtt" | "diarized_json";
 ```
 
 Defined in: [packages/ai/src/types.ts:1693](https://github.com/TanStack/ai/blob/main/packages/ai/src/types.ts#L1693)
diff --git a/packages/ai-client/src/generation-types.ts b/packages/ai-client/src/generation-types.ts
index 347be9d1b..2a2974d3e 100644
--- a/packages/ai-client/src/generation-types.ts
+++ b/packages/ai-client/src/generation-types.ts
@@ -265,7 +265,13 @@ export interface TranscriptionGenerateInput {
   /** An optional prompt to guide the transcription */
   prompt?: string
   /** The format of the transcription output */
-  responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
+  responseFormat?:
+    | 'json'
+    | 'text'
+    | 'srt'
+    | 'verbose_json'
+    | 'vtt'
+    | 'diarized_json'
   /** Model-specific options */
   modelOptions?: Record<string, any>
 }
diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts
index 060dbf0e8..44cfc06fb 100644
--- a/packages/ai-openai/src/adapters/transcription.ts
+++ b/packages/ai-openai/src/adapters/transcription.ts
@@ -13,6 +13,24 @@ import type { OpenAITranscriptionModel } from '../model-meta'
 import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-provider-options'
 import type { OpenAIClientConfig } from '../utils/client'
 
+const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const
+
+type DiarizeModel = (typeof DIARIZE_MODELS)[number]
+/** Type guard: reports whether `model` is one of the DIARIZE_MODELS entries. */
+function isDiarizeModel(model: string): model is DiarizeModel {
+  return DIARIZE_MODELS.some((known) => known === model)
+}
+
+function mapDiarizedSegmentId(id: string, index: number): number {
+  // Ids shaped like `seg_3` map to their numeric suffix.
+  const match = /^seg_(\d+)$/.exec(id)
+  if (match) return Number(match[1])
+  // Digits only: Number('') is 0, so a NaN check would accept empty ids.
+  if (/^\d+$/.test(id)) return Number(id)
+  // Any other id shape falls back to the segment's position in the response.
+  return index
+}
+
 /**
  * Configuration for OpenAI Transcription adapter
  */
@@ -22,12 +40,12 @@ export interface OpenAITranscriptionConfig extends OpenAIClientConfig {}
  * OpenAI Transcription (Speech-to-Text) Adapter
  *
  * Tree-shakeable adapter for OpenAI audio transcription functionality.
- * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize models.
+ * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize.
  *
  * Features:
  * - Multiple transcription models with different capabilities
  * - Language detection or specification
- * - Multiple output formats: json, text, srt, verbose_json, vtt
+ * - Multiple output formats: json, text, srt, verbose_json, vtt, diarized_json
  * - Word and segment-level timestamps (with verbose_json — whisper-1 only;
  *   gpt-4o-* transcribe models accept only json/text and reject verbose_json
  *   with HTTP 400)
@@ -52,12 +70,24 @@ export class OpenAITranscriptionAdapter<
       options
 
     const file = this.prepareAudioFile(audio)
+    const isDiarizeTranscriptionModel = isDiarizeModel(model)
+    const useDiarized =
+      responseFormat === 'diarized_json' ||
+      (isDiarizeTranscriptionModel && responseFormat === undefined)
+    this.validateDiarizationOptions({
+      model,
+      prompt,
+      responseFormat,
+      modelOptions,
+    })
 
     // With exactOptionalPropertyTypes, vendor SDK request shapes reject
     // `T | undefined` in optional fields. Build the request incrementally and
     // only set optional fields when they're actually defined.
-    const responseFormatValue = this.mapResponseFormat(responseFormat)
-    const request: OpenAI_SDK.Audio.TranscriptionCreateParams = {
+    const responseFormatValue = useDiarized
+      ? 'diarized_json'
+      : this.mapResponseFormat(responseFormat)
+    const request: OpenAI_SDK.Audio.TranscriptionCreateParamsNonStreaming = {
       model,
       file,
       ...(modelOptions ?? {}),
@@ -68,6 +98,12 @@ export class OpenAITranscriptionAdapter<
     if (prompt !== undefined) {
       request.prompt = prompt
     }
+    if (
+      isDiarizeTranscriptionModel &&
+      modelOptions?.chunking_strategy === undefined
+    ) {
+      request.chunking_strategy = 'auto'
+    }
     if (responseFormatValue !== undefined) {
       request.response_format = responseFormatValue
     }
@@ -75,14 +111,38 @@ export class OpenAITranscriptionAdapter<
     // Only Whisper supports verbose_json. The gpt-4o-* transcribe models
     // accept only json/text and reject verbose_json with HTTP 400.
     const useVerbose =
-      responseFormat === 'verbose_json' ||
+      (!useDiarized && responseFormat === 'verbose_json') ||
       (!responseFormat && model === 'whisper-1')
 
     try {
       options.logger.request(
-        `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose}`,
+        `activity=transcription provider=${this.name} model=${model} verbose=${useVerbose} diarized=${useDiarized}`,
         { provider: this.name, model },
       )
+      if (useDiarized) {
+        const response = (await this.client.audio.transcriptions.create(
+          request,
+        )) as OpenAI_SDK.Audio.TranscriptionDiarized
+
+        const segments = response.segments.map(
+          (segment, index): TranscriptionSegment => ({
+            id: mapDiarizedSegmentId(segment.id, index),
+            start: segment.start,
+            end: segment.end,
+            text: segment.text,
+            speaker: segment.speaker,
+          }),
+        )
+
+        return {
+          id: generateId(this.name),
+          model,
+          text: response.text,
+          duration: response.duration,
+          ...(segments.length > 0 && { segments }),
+        }
+      }
+
       if (useVerbose) {
         const response = (await this.client.audio.transcriptions.create({
           ...request,
@@ -118,15 +178,15 @@ export class OpenAITranscriptionAdapter<
           ...(segments !== undefined && { segments }),
           ...(words !== undefined && { words }),
         }
-      } else {
-        const response = await this.client.audio.transcriptions.create(request)
+      }
 
-        return {
-          id: generateId(this.name),
-          model,
-          text: typeof response === 'string' ? response : response.text,
-          ...(language !== undefined && { language }),
-        }
+      const response = await this.client.audio.transcriptions.create(request)
+
+      return {
+        id: generateId(this.name),
+        model,
+        text: typeof response === 'string' ? response : response.text,
+        ...(language !== undefined && { language }),
       }
     } catch (error: unknown) {
       options.logger.errors(`${this.name}.transcribe fatal`, {
@@ -182,8 +242,81 @@ export class OpenAITranscriptionAdapter<
     }
   }
 
+  private validateDiarizationOptions({
+    model,
+    prompt,
+    responseFormat,
+    modelOptions,
+  }: Pick<
+    TranscriptionOptions<OpenAITranscriptionProviderOptions>,
+    'model' | 'prompt' | 'responseFormat' | 'modelOptions'
+  >): void {
+    const isDiarizeTranscriptionModel = isDiarizeModel(model)
+    // Diarization-only settings on any other model are rejected outright.
+    if (
+      !isDiarizeTranscriptionModel &&
+      (responseFormat === 'diarized_json' ||
+        modelOptions?.known_speaker_names !== undefined ||
+        modelOptions?.known_speaker_references !== undefined)
+    ) {
+      throw new Error(
+        'OpenAI speaker diarization options are only supported with OpenAI diarization transcription models.',
+      )
+    }
+
+    if (!isDiarizeTranscriptionModel) return
+    // Remaining checks apply to diarization models only.
+    if (prompt !== undefined) {
+      throw new Error(
+        'OpenAI diarization transcription models do not support prompts.',
+      )
+    }
+
+    if (modelOptions?.include !== undefined) {
+      throw new Error(
+        'OpenAI diarization transcription models do not support the include option.',
+      )
+    }
+
+    if (modelOptions?.timestamp_granularities !== undefined) {
+      throw new Error(
+        'OpenAI diarization transcription models do not support timestamp_granularities.',
+      )
+    }
+    // Each known-speaker list is capped at 4 entries.
+    if (modelOptions?.known_speaker_names !== undefined) {
+      const knownSpeakerCount = modelOptions.known_speaker_names.length
+      if (knownSpeakerCount > 4) {
+        throw new Error(
+          'OpenAI diarization transcription models support at most 4 known speaker names.',
+        )
+      }
+    }
+
+    if (modelOptions?.known_speaker_references !== undefined) {
+      const knownSpeakerReferenceCount =
+        modelOptions.known_speaker_references.length
+      if (knownSpeakerReferenceCount > 4) {
+        throw new Error(
+          'OpenAI diarization transcription models support at most 4 known speaker references.',
+        )
+      }
+    }
+    // When both lists are given they must be the same length.
+    if (
+      modelOptions?.known_speaker_names !== undefined &&
+      modelOptions.known_speaker_references !== undefined &&
+      modelOptions.known_speaker_names.length !==
+        modelOptions.known_speaker_references.length
+    ) {
+      throw new Error(
+        'OpenAI diarization known_speaker_names and known_speaker_references must have matching lengths.',
+      )
+    }
+  }
+
   protected mapResponseFormat(
-    format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt',
+    format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json',
   ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] {
     if (!format) return 'json'
     return format
diff --git a/packages/ai-openai/src/audio/transcription-provider-options.ts b/packages/ai-openai/src/audio/transcription-provider-options.ts
index 17f619cb0..befa5df6a 100644
--- a/packages/ai-openai/src/audio/transcription-provider-options.ts
+++ b/packages/ai-openai/src/audio/transcription-provider-options.ts
@@ -38,4 +38,14 @@ export interface OpenAITranscriptionProviderOptions {
    * Optional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file.
    */
   known_speaker_references?: Array<string>
+  /**
+   * Controls how the audio is cut into chunks. Required by OpenAI when
+   * `gpt-4o-transcribe-diarize` input is longer than 30 seconds. Use `"auto"`
+   * for the service-managed VAD strategy, or pass a `server_vad` config to tune
+   * segmentation.
+   */
+  chunking_strategy?:
+    | 'auto'
+    | OpenAI.Audio.TranscriptionCreateParams.VadConfig
+    | null
 }
diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts
new file mode 100644
index 000000000..fc68d3b86
--- /dev/null
+++ b/packages/ai-openai/tests/transcription-adapter.test.ts
@@ -0,0 +1,300 @@
+import { describe, expect, it, vi } from 'vitest'
+import { resolveDebugOption } from '@tanstack/ai/adapter-internals'
+import {
+  OpenAITranscriptionAdapter,
+  createOpenaiTranscription,
+} from '../src/adapters/transcription'
+import type OpenAI from 'openai'
+import type { OpenAITranscriptionModel } from '../src/model-meta'
+
+const testLogger = resolveDebugOption(false)
+
+class TestOpenAITranscriptionAdapter<
+  TModel extends OpenAITranscriptionModel,
+> extends OpenAITranscriptionAdapter<TModel> {
+  spyOnTranscriptionsCreate() {
+    return vi.spyOn(this.client.audio.transcriptions, 'create')
+  }
+}
+
+describe('OpenAI transcription adapter', () => {
+  it('creates a diarization-capable adapter', () => {
+    const adapter = createOpenaiTranscription(
+      'gpt-4o-transcribe-diarize',
+      'test-api-key',
+    )
+
+    expect(adapter).toBeInstanceOf(OpenAITranscriptionAdapter)
+    expect(adapter.name).toBe('openai')
+  })
+
+  it('defaults the diarization model to diarized_json with automatic chunking', async () => {
+    const mockResponse: OpenAI.Audio.TranscriptionDiarized = {
+      text: 'Agent: Hello\nCustomer: Hi',
+      duration: 2.2,
+      task: 'transcribe',
+      segments: [
+        {
+          id: 'seg_0',
+          type: 'transcript.text.segment',
+          start: 0,
+          end: 1.4,
+          text: 'Hello',
+          speaker: 'agent',
+        },
+        {
+          id: 'seg_1',
+          type: 'transcript.text.segment',
+          start: 1.5,
+          end: 2.2,
+          text: 'Hi',
+          speaker: 'customer',
+        },
+      ],
+    }
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+    const mockCreate = adapter
+      .spyOnTranscriptionsCreate()
+      .mockResolvedValueOnce(mockResponse)
+
+    const result = await adapter.transcribe({
+      model: 'gpt-4o-transcribe-diarize',
+      audio: new File([], 'meeting.wav', { type: 'audio/wav' }),
+      logger: testLogger,
+    })
+
+    expect(mockCreate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        model: 'gpt-4o-transcribe-diarize',
+        response_format: 'diarized_json',
+        chunking_strategy: 'auto',
+      }),
+    )
+    expect(result.text).toBe('Agent: Hello\nCustomer: Hi')
+    expect(result.segments).toEqual([
+      {
+        id: 0,
+        start: 0,
+        end: 1.4,
+        text: 'Hello',
+        speaker: 'agent',
+      },
+      {
+        id: 1,
+        start: 1.5,
+        end: 2.2,
+        text: 'Hi',
+        speaker: 'customer',
+      },
+    ])
+  })
+
+  it('passes explicit diarization chunking and known speaker references', async () => {
+    const mockResponse: OpenAI.Audio.TranscriptionDiarized = {
+      text: 'Speaker text',
+      duration: 1,
+      task: 'transcribe',
+      segments: [
+        {
+          id: 'speaker-intro',
+          type: 'transcript.text.segment',
+          start: 0,
+          end: 1,
+          text: 'Speaker text',
+          speaker: 'agent',
+        },
+      ],
+    }
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+    const mockCreate = adapter
+      .spyOnTranscriptionsCreate()
+      .mockResolvedValueOnce(mockResponse)
+
+    const result = await adapter.transcribe({
+      model: 'gpt-4o-transcribe-diarize',
+      audio: new File([], 'meeting.wav', { type: 'audio/wav' }),
+      responseFormat: 'diarized_json',
+      modelOptions: {
+        chunking_strategy: {
+          type: 'server_vad',
+          threshold: 0.5,
+          prefix_padding_ms: 300,
+          silence_duration_ms: 500,
+        },
+        known_speaker_names: ['agent'],
+        known_speaker_references: ['data:audio/wav;base64,AAA='],
+      },
+      logger: testLogger,
+    })
+
+    expect(mockCreate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        response_format: 'diarized_json',
+        chunking_strategy: {
+          type: 'server_vad',
+          threshold: 0.5,
+          prefix_padding_ms: 300,
+          silence_duration_ms: 500,
+        },
+        known_speaker_names: ['agent'],
+        known_speaker_references: ['data:audio/wav;base64,AAA='],
+      }),
+    )
+    expect(result.segments?.[0]?.id).toBe(0)
+  })
+
+  it('respects explicit null chunking for short diarization inputs', async () => {
+    const mockResponse: OpenAI.Audio.TranscriptionDiarized = {
+      text: 'Hello',
+      duration: 1,
+      task: 'transcribe',
+      segments: [],
+    }
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+    const mockCreate = adapter
+      .spyOnTranscriptionsCreate()
+      .mockResolvedValueOnce(mockResponse)
+
+    await adapter.transcribe({
+      model: 'gpt-4o-transcribe-diarize',
+      audio: new File([], 'short.wav', { type: 'audio/wav' }),
+      modelOptions: {
+        chunking_strategy: null,
+      },
+      logger: testLogger,
+    })
+
+    expect(mockCreate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        chunking_strategy: null,
+      }),
+    )
+  })
+
+  it('allows json or text response formats for the diarization model', async () => {
+    const mockResponse: OpenAI.Audio.Transcription = {
+      text: 'Hello',
+    }
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+    const mockCreate = adapter
+      .spyOnTranscriptionsCreate()
+      .mockResolvedValueOnce(mockResponse)
+
+    const result = await adapter.transcribe({
+      model: 'gpt-4o-transcribe-diarize',
+      audio: new File([], 'short.wav', { type: 'audio/wav' }),
+      responseFormat: 'json',
+      logger: testLogger,
+    })
+
+    expect(mockCreate).toHaveBeenCalledWith(
+      expect.objectContaining({
+        response_format: 'json',
+        chunking_strategy: 'auto',
+      }),
+    )
+    expect(result).toMatchObject({
+      model: 'gpt-4o-transcribe-diarize',
+      text: 'Hello',
+    })
+  })
+
+  it('rejects diarized_json with non-diarization models', async () => {
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'whisper-1',
+    )
+
+    await expect(
+      adapter.transcribe({
+        model: 'whisper-1',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        responseFormat: 'diarized_json',
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('speaker diarization options')
+  })
+
+  it('rejects unsupported diarization prompt and timestamp options', async () => {
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+
+    await expect(
+      adapter.transcribe({
+        model: 'gpt-4o-transcribe-diarize',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        prompt: 'Use product vocabulary',
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('do not support prompts')
+
+    await expect(
+      adapter.transcribe({
+        model: 'gpt-4o-transcribe-diarize',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        modelOptions: {
+          timestamp_granularities: ['word'],
+        },
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('timestamp_granularities')
+  })
+
+  it('rejects unsupported diarization include and too many known speakers', async () => {
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+
+    await expect(
+      adapter.transcribe({
+        model: 'gpt-4o-transcribe-diarize',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        modelOptions: {
+          include: ['logprobs'],
+        },
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('include')
+
+    await expect(
+      adapter.transcribe({
+        model: 'gpt-4o-transcribe-diarize',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        modelOptions: {
+          known_speaker_names: ['a', 'b', 'c', 'd', 'e'],
+        },
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('at most 4')
+
+    await expect(
+      adapter.transcribe({
+        model: 'gpt-4o-transcribe-diarize',
+        audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+        modelOptions: {
+          known_speaker_names: ['agent'],
+          known_speaker_references: [
+            'data:audio/wav;base64,AAA=',
+            'data:audio/wav;base64,BBB=',
+          ],
+        },
+        logger: testLogger,
+      }),
+    ).rejects.toThrow('matching lengths')
+  })
+})
diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md
index 09a552b73..239e29308 100644
--- a/packages/ai/skills/ai-core/media-generation/SKILL.md
+++ b/packages/ai/skills/ai-core/media-generation/SKILL.md
@@ -259,7 +259,7 @@ const { generate, result, isLoading } = useGenerateSpeech({
 ### 4. Audio Transcription
 
 Adapter: `openaiTranscription` (whisper-1, gpt-4o-transcribe,
-gpt-4o-mini-transcribe).
+gpt-4o-mini-transcribe, gpt-4o-transcribe-diarize).
 
 ```typescript
 import { generateTranscription } from '@tanstack/ai'
@@ -271,7 +271,7 @@ const result = await generateTranscription({
   language: 'en',
   responseFormat: 'verbose_json',
   modelOptions: {
-    include: ['segment', 'word'],
+    timestamp_granularities: ['word', 'segment'],
   },
 })
 
@@ -281,6 +281,10 @@ const result = await generateTranscription({
 // result.segments   -- timestamped segments with optional word-level timestamps
 ```
 
+For speaker diarization, use `openaiTranscription('gpt-4o-transcribe-diarize')`.
+It defaults to `responseFormat: 'diarized_json'` and `chunking_strategy: 'auto'`, and accepts only the `json`, `text`, and `diarized_json` response formats.
+Do not pass `prompt`, `include`, or `timestamp_granularities` with this model.
+
 Client hook:
 
 ```tsx
diff --git a/packages/ai/src/activities/generateTranscription/index.ts b/packages/ai/src/activities/generateTranscription/index.ts
index 90262e9e9..9705d57b0 100644
--- a/packages/ai/src/activities/generateTranscription/index.ts
+++ b/packages/ai/src/activities/generateTranscription/index.ts
@@ -59,7 +59,13 @@ export interface TranscriptionActivityOptions<
   /** An optional prompt to guide the transcription */
   prompt?: string
   /** The format of the transcription output */
-  responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
+  responseFormat?:
+    | 'json'
+    | 'text'
+    | 'srt'
+    | 'verbose_json'
+    | 'vtt'
+    | 'diarized_json'
   /** Provider-specific options for transcription */
   modelOptions?: TranscriptionProviderOptions<TAdapter>
   /**
diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts
index 6c596a2dc..01c768a9d 100644
--- a/packages/ai/src/types.ts
+++ b/packages/ai/src/types.ts
@@ -1690,7 +1690,13 @@ export interface TranscriptionOptions<
   /** An optional prompt to guide the transcription */
   prompt?: string
   /** The format of the transcription output */
-  responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
+  responseFormat?:
+    | 'json'
+    | 'text'
+    | 'srt'
+    | 'verbose_json'
+    | 'vtt'
+    | 'diarized_json'
   /** Model-specific options for transcription */
   modelOptions?: TProviderOptions
   /**

From 05dfb5360e845db5abe6d3bbe926aefd78c6943c Mon Sep 17 00:00:00 2001
From: 8times4 <46720448+8times4@users.noreply.github.com>
Date: Thu, 28 May 2026 13:08:58 +0200
Subject: [PATCH 2/2] validate diarization response formats (CodeRabbit review)

---
 .../ai-openai/src/adapters/transcription.ts   | 35 ++++++++++++++++---
 .../tests/transcription-adapter.test.ts       | 20 +++++++++++
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/packages/ai-openai/src/adapters/transcription.ts b/packages/ai-openai/src/adapters/transcription.ts
index 44cfc06fb..892f7a414 100644
--- a/packages/ai-openai/src/adapters/transcription.ts
+++ b/packages/ai-openai/src/adapters/transcription.ts
@@ -14,8 +14,12 @@ import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-
 import type { OpenAIClientConfig } from '../utils/client'
 
 const DIARIZE_MODELS = ['gpt-4o-transcribe-diarize'] as const
+const DIARIZE_RESPONSE_FORMATS = ['json', 'text', 'diarized_json'] as const
 
 type DiarizeModel = (typeof DIARIZE_MODELS)[number]
+type OpenAITranscriptionResponseFormat = NonNullable<
+  TranscriptionOptions<OpenAITranscriptionProviderOptions>['responseFormat']
+>
 
 function isDiarizeModel(model: string): model is DiarizeModel {
   return DIARIZE_MODELS.includes(model as DiarizeModel)
@@ -104,9 +108,7 @@ export class OpenAITranscriptionAdapter<
     ) {
       request.chunking_strategy = 'auto'
     }
-    if (responseFormatValue !== undefined) {
-      request.response_format = responseFormatValue
-    }
+    request.response_format = responseFormatValue
 
     // Only Whisper supports verbose_json. The gpt-4o-* transcribe models
     // accept only json/text and reject verbose_json with HTTP 400.
@@ -266,6 +268,29 @@ export class OpenAITranscriptionAdapter<
 
     if (!isDiarizeTranscriptionModel) return
 
+    const modelOptionsResponseFormat = (
+      modelOptions as
+        | { responseFormat?: OpenAITranscriptionResponseFormat }
+        | undefined
+    )?.responseFormat
+    const requestedResponseFormats = [
+      this.mapResponseFormat(responseFormat),
+      ...(modelOptionsResponseFormat !== undefined
+        ? [this.mapResponseFormat(modelOptionsResponseFormat)]
+        : []),
+    ]
+    const unsupportedResponseFormat = requestedResponseFormats.find(
+      (format) =>
+        !DIARIZE_RESPONSE_FORMATS.includes(
+          format as (typeof DIARIZE_RESPONSE_FORMATS)[number],
+        ),
+    )
+    if (unsupportedResponseFormat !== undefined) {
+      throw new Error(
+        'OpenAI diarization transcription models only support json, text, and diarized_json response formats.',
+      )
+    }
+
     if (prompt !== undefined) {
       throw new Error(
         'OpenAI diarization transcription models do not support prompts.',
@@ -316,8 +341,8 @@ export class OpenAITranscriptionAdapter<
   }
 
   protected mapResponseFormat(
-    format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' | 'diarized_json',
-  ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] {
+    format?: OpenAITranscriptionResponseFormat,
+  ): OpenAITranscriptionResponseFormat {
     if (!format) return 'json'
     return format
   }
diff --git a/packages/ai-openai/tests/transcription-adapter.test.ts b/packages/ai-openai/tests/transcription-adapter.test.ts
index fc68d3b86..993bcb135 100644
--- a/packages/ai-openai/tests/transcription-adapter.test.ts
+++ b/packages/ai-openai/tests/transcription-adapter.test.ts
@@ -211,6 +211,26 @@ describe('OpenAI transcription adapter', () => {
     })
   })
 
+  it('rejects unsupported response formats for the diarization model', async () => {
+    const adapter = new TestOpenAITranscriptionAdapter(
+      { apiKey: 'test-api-key' },
+      'gpt-4o-transcribe-diarize',
+    )
+
+    for (const responseFormat of ['srt', 'vtt', 'verbose_json'] as const) {
+      await expect(
+        adapter.transcribe({
+          model: 'gpt-4o-transcribe-diarize',
+          audio: new File([], 'audio.wav', { type: 'audio/wav' }),
+          responseFormat,
+          logger: testLogger,
+        }),
+      ).rejects.toThrow(
+        'diarization transcription models only support json, text, and diarized_json',
+      )
+    }
+  })
+
   it('rejects diarized_json with non-diarization models', async () => {
     const adapter = new TestOpenAITranscriptionAdapter(
       { apiKey: 'test-api-key' },