From 18b3e99e98adfa6e4c4aee5510ca910b634d53e4 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Fri, 8 May 2026 15:51:11 +0900 Subject: [PATCH 1/3] docs: add timestamped TTS stream reference --- .../text-to-speech-stream-with-timestamps.mdx | 192 +++++ api-reference/openapi.json | 746 +++++++++++++++++- .../core-features/text-to-speech.mdx | 13 +- docs.json | 1 + 4 files changed, 942 insertions(+), 10 deletions(-) create mode 100644 api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx diff --git a/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx new file mode 100644 index 0000000..54d1f65 --- /dev/null +++ b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx @@ -0,0 +1,192 @@ +--- +openapi: post /v1/tts/stream/with-timestamp +title: "Text to Speech Stream with Timestamps" +description: "Stream generated speech and timestamp alignment events" +icon: "waveform-lines" +iconType: "solid" +--- + + + This endpoint returns `text/event-stream`. Each SSE `message` event contains + one JSON payload with a base64-encoded audio chunk. + + + + Use this endpoint when you need both progressive audio delivery and + text-to-audio alignment data, such as karaoke-style highlighting, word or + phrase progress indicators, captions synchronized to generated speech, or + timeline editing. + + +## How the Stream Works + +The response is a Server-Sent Events stream. Every event includes: + +| Field | Type | Description | +| -------------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | +| `audio_base64` | `string` | One base64-encoded audio chunk. Concatenate all chunks in arrival order to reconstruct the complete audio. | +| `content` | `string` | The text covered by this event's generated audio chunk. Long input can be split into multiple content chunks. 
| +| `alignment` | `object \| null` | Timestamp alignment for this content chunk. Audio-only continuation events can return `null`. | + +When `latency` is set to `balanced`, long input can be split into several text chunks. Each text chunk may produce one non-null `alignment` event, followed by one or more audio-only events where `alignment` is `null`. + + + Collect every non-null `alignment` in stream order. Do not keep only the first + or last alignment event. + + +## Alignment Shape + +Each non-null `alignment` contains the generated audio duration and ordered timing segments: + +```json +{ + "alignment": { + "audio_duration": 16.24, + "segments": [ + { + "text": "Hello", + "start": 0, + "end": 0.42 + }, + { + "text": "world", + "start": 0.42, + "end": 0.86 + } + ] + } +} +``` + +`start` and `end` are measured in seconds from the start of that content chunk's generated audio. To build a single global timeline, offset each later chunk's `start` and `end` by the summed `audio_duration` of all preceding aligned chunks. + +## Minimal Request + +```bash +curl --no-buffer --request POST \ + --url https://api.fish.audio/v1/tts/stream/with-timestamp \ + --header 'Authorization: Bearer ' \ + --header 'Content-Type: application/json' \ + --header 'model: s2-pro' \ + --data '{ + "text": "Hello! Welcome to Fish Audio.", + "reference_id": "model-id", + "format": "opus", + "latency": "balanced" + }' +``` + +## Parsing the Stream + +The stream payload uses standard SSE framing. Parse each `data:` line as JSON, append every decoded `audio_base64` chunk to your audio buffer, and store non-null alignments separately. + + + + + ```python + import base64 + import json + import requests + + response = requests.post( + "https://api.fish.audio/v1/tts/stream/with-timestamp", + headers={ + "Authorization": "Bearer ", + "Content-Type": "application/json", + "model": "s2-pro", + }, + json={ + "text": "Hello! 
Welcome to Fish Audio.", + "reference_id": "model-id", + "format": "opus", + "latency": "balanced", + }, + stream=True, + ) + + audio_chunks = [] + alignments = [] + + for line in response.iter_lines(decode_unicode=True): + if not line or not line.startswith("data: "): + continue + + event = json.loads(line.removeprefix("data: ")) + audio_chunks.append(base64.b64decode(event["audio_base64"])) + + if event["alignment"] is not None: + alignments.append(event["alignment"]) + + audio = b"".join(audio_chunks) + ``` + + + + + ```javascript + const response = await fetch( + "https://api.fish.audio/v1/tts/stream/with-timestamp", + { + method: "POST", + headers: { + Authorization: "Bearer ", + "Content-Type": "application/json", + model: "s2-pro", + }, + body: JSON.stringify({ + text: "Hello! Welcome to Fish Audio.", + reference_id: "model-id", + format: "opus", + latency: "balanced", + }), + } + ); + + const audioChunks = []; + const alignments = []; + const decoder = new TextDecoder(); + let buffer = ""; + + for await (const chunk of response.body) { + buffer += decoder.decode(chunk, { stream: true }); + const events = buffer.split("\n\n"); + buffer = events.pop() ?? ""; + + for (const eventText of events) { + const dataLine = eventText + .split("\n") + .find(line => line.startsWith("data: ")); + + if (!dataLine) continue; + + const event = JSON.parse(dataLine.slice(6)); + audioChunks.push(Buffer.from(event.audio_base64, "base64")); + + if (event.alignment !== null) { + alignments.push(event.alignment); + } + } + } + + const audio = Buffer.concat(audioChunks); + ``` + + + + +## Format Guidance + +For timestamped streaming, we recommend `opus` with the default 48 kHz sample rate when your client supports it. Opus is designed for streaming and gives the best balance of quality, latency, and bandwidth for this endpoint. + +`wav` and `pcm` avoid lossy codec artifacts and are straightforward to align, but they produce much larger payloads. 
Use them when you need uncompressed audio, direct sample-level processing, or a playback pipeline that already expects raw audio. + + + Use `mp3` only when broad playback compatibility is more important than the + cleanest streaming boundaries. MP3 encoding uses overlapping audio windows, so + this endpoint must flush complete sentence audio before emitting alignment + data. Around sentence boundaries, that flush can introduce a small quality + loss or discontinuity compared with `opus`. + + +This endpoint accepts the same TTS request fields as the [Text to Speech API](/api-reference/endpoint/openapi-v1/text-to-speech), including `reference_id`, `references`, `prosody`, `temperature`, `top_p`, `chunk_length`, `format`, and `latency`. diff --git a/api-reference/openapi.json b/api-reference/openapi.json index 2f8e019..4b0a3a1 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -2402,6 +2402,24 @@ "BearerAuth": [] } ], + "parameters": [ + { + "in": "header", + "name": "model", + "description": "Specify which TTS model to use. We recommend `s2-pro`.", + "required": true, + "schema": { + "default": "s2-pro", + "enum": [ + "s1", + "s2-pro" + ], + "title": "Model", + "type": "string" + }, + "deprecated": false + } + ], "requestBody": { "required": true, "content": { @@ -2538,6 +2556,28 @@ "tags": [ "OpenAPI v1" ], + "x-codeSamples": [ + { + "lang": "bash", + "label": "Single Speaker", + "source": "curl --request POST \\\n --url https://api.fish.audio/v1/tts \\\n --header 'Authorization: Bearer ' \\\n --header 'Content-Type: application/json' \\\n --header 'model: s2-pro' \\\n --data '{\n \"text\": \"Hello! 
Welcome to Fish Audio.\",\n \"reference_id\": \"model-id\",\n \"temperature\": 0.7,\n \"top_p\": 0.7,\n \"prosody\": {\n \"speed\": 1,\n \"volume\": 0,\n \"normalize_loudness\": true\n },\n \"chunk_length\": 300,\n \"normalize\": true,\n \"format\": \"mp3\",\n \"sample_rate\": 44100,\n \"mp3_bitrate\": 128,\n \"latency\": \"normal\",\n \"max_new_tokens\": 1024,\n \"repetition_penalty\": 1.2,\n \"min_chunk_length\": 50,\n \"condition_on_previous_chunks\": true,\n \"early_stop_threshold\": 1\n }'" + }, + { + "lang": "bash", + "label": "Multi Speaker (S2-Pro only)", + "source": "curl --request POST \\\n --url https://api.fish.audio/v1/tts \\\n --header 'Authorization: Bearer ' \\\n --header 'Content-Type: application/json' \\\n --header 'model: s2-pro' \\\n --data '{\n \"text\": \"<|speaker:0|>Hello!<|speaker:1|>Hi there!\",\n \"reference_id\": [\"speaker-a-id\", \"speaker-b-id\"],\n \"temperature\": 0.7,\n \"top_p\": 0.7,\n \"prosody\": {\n \"speed\": 1,\n \"volume\": 0,\n \"normalize_loudness\": true\n },\n \"chunk_length\": 300,\n \"normalize\": true,\n \"format\": \"mp3\",\n \"sample_rate\": 44100,\n \"mp3_bitrate\": 128,\n \"latency\": \"normal\",\n \"max_new_tokens\": 1024,\n \"repetition_penalty\": 1.2,\n \"min_chunk_length\": 50,\n \"condition_on_previous_chunks\": true,\n \"early_stop_threshold\": 1\n }'" + } + ] + } + }, + "/v1/tts/stream/with-timestamp": { + "post": { + "summary": "Text to Speech Stream with Timestamps", + "security": [ + { + "BearerAuth": [] + } + ], "parameters": [ { "in": "header", @@ -2545,25 +2585,664 @@ "description": "Specify which TTS model to use. 
We recommend `s2-pro`.", "required": true, "schema": { - "type": "string", "default": "s2-pro", "enum": [ "s1", "s2-pro" - ] + ], + "title": "Model", + "type": "string" + }, + "deprecated": false + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TTSRequest" + } + }, + "application/msgpack": { + "schema": { + "$ref": "#/components/schemas/TTSRequest" + } } } + }, + "responses": { + "200": { + "description": "Server-Sent Events stream. Each `message` event contains a JSON payload with one base64 audio chunk. Concatenate every `audio_base64` chunk in arrival order to reconstruct the complete audio. In balanced streaming, long input can be split into multiple text chunks. Each text chunk may produce a non-null `alignment` event, followed by one or more audio-only events for the same `content` where `alignment` is null. Clients should collect every non-null `alignment` in order instead of keeping only the first or last event.", + "headers": { + "Transfer-Encoding": { + "schema": { + "type": "string" + }, + "description": "chunked" + } + }, + "content": { + "text/event-stream": { + "schema": { + "description": "One Server-Sent Events message payload for streaming TTS with timestamps. Each event contains one audio chunk. Concatenate all `audio_base64` chunks in arrival order to reconstruct the complete audio. Long input may be split into multiple content chunks. Each chunk can have its own non-null `alignment`, followed by additional audio-only events for that chunk where `alignment` is null. 
Collect every non-null `alignment` in order.", + "examples": [ + { + "alignment": { + "audio_duration": 16.24, + "segments": [ + { + "end": 0.16, + "start": 0, + "text": "I" + }, + { + "end": 0.48, + "start": 0.16, + "text": "can't" + }, + { + "end": 0.8, + "start": 0.48, + "text": "believe" + }, + { + "end": 1.12, + "start": 0.8, + "text": "its" + }, + { + "end": 1.44, + "start": 1.2, + "text": "been" + }, + { + "end": 1.76, + "start": 1.44, + "text": "this" + }, + { + "end": 2.48, + "start": 1.76, + "text": "long" + }, + { + "end": 2.64, + "start": 2.56, + "text": "It" + }, + { + "end": 3.04, + "start": 2.72, + "text": "feels" + }, + { + "end": 3.28, + "start": 3.12, + "text": "like" + }, + { + "end": 4, + "start": 3.36, + "text": "forever" + }, + { + "end": 4.32, + "start": 4, + "text": "since" + }, + { + "end": 4.48, + "start": 4.32, + "text": "we" + }, + { + "end": 4.96, + "start": 4.48, + "text": "last" + }, + { + "end": 5.28, + "start": 4.96, + "text": "really" + }, + { + "end": 5.84, + "start": 5.28, + "text": "talked" + }, + { + "end": 6.24, + "start": 6, + "text": "Ive" + }, + { + "end": 6.64, + "start": 6.24, + "text": "missed" + }, + { + "end": 6.96, + "start": 6.64, + "text": "hearing" + }, + { + "end": 7.2, + "start": 6.96, + "text": "your" + }, + { + "end": 7.76, + "start": 7.2, + "text": "voice" + }, + { + "end": 7.92, + "start": 7.76, + "text": "your" + }, + { + "end": 8.48, + "start": 7.92, + "text": "stories" + }, + { + "end": 8.72, + "start": 8.48, + "text": "even" + }, + { + "end": 8.8, + "start": 8.72, + "text": "the" + }, + { + "end": 9.2, + "start": 8.8, + "text": "little" + }, + { + "end": 9.52, + "start": 9.2, + "text": "things" + }, + { + "end": 9.68, + "start": 9.52, + "text": "you" + }, + { + "end": 10, + "start": 9.68, + "text": "used" + }, + { + "end": 10.08, + "start": 10, + "text": "to" + }, + { + "end": 10.64, + "start": 10.08, + "text": "say" + }, + { + "end": 10.96, + "start": 10.64, + "text": "How" + }, + { + "end": 11.12, + 
"start": 10.96, + "text": "have" + }, + { + "end": 11.36, + "start": 11.12, + "text": "you" + }, + { + "end": 11.92, + "start": 11.36, + "text": "been" + }, + { + "end": 12.24, + "start": 12, + "text": "Ive" + }, + { + "end": 12.48, + "start": 12.24, + "text": "thought" + }, + { + "end": 12.8, + "start": 12.48, + "text": "about" + }, + { + "end": 13.2, + "start": 12.8, + "text": "calling" + }, + { + "end": 13.36, + "start": 13.2, + "text": "you" + }, + { + "end": 13.68, + "start": 13.36, + "text": "so" + }, + { + "end": 13.92, + "start": 13.68, + "text": "many" + }, + { + "end": 14.56, + "start": 13.92, + "text": "times" + }, + { + "end": 14.72, + "start": 14.56, + "text": "but" + }, + { + "end": 14.88, + "start": 14.72, + "text": "I" + }, + { + "end": 15.2, + "start": 14.88, + "text": "never" + }, + { + "end": 15.36, + "start": 15.2, + "text": "knew" + }, + { + "end": 15.6, + "start": 15.36, + "text": "where" + }, + { + "end": 15.6, + "start": 15.6, + "text": "to" + }, + { + "end": 16.24, + "start": 15.68, + "text": "start" + } + ] + }, + "audio_base64": "SUQzBAAAAAAA...", + "content": "I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start." + }, + { + "alignment": null, + "audio_base64": "//uSxOAAF...", + "content": "I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start." 
+ }, + { + "alignment": { + "audio_duration": 10.48, + "segments": [ + { + "end": 0.8, + "start": 0.4, + "text": "Seeing" + }, + { + "end": 0.96, + "start": 0.8, + "text": "you" + }, + { + "end": 1.44, + "start": 0.96, + "text": "again" + }, + { + "end": 1.68, + "start": 1.44, + "text": "now" + }, + { + "end": 2.08, + "start": 1.68, + "text": "makes" + }, + { + "end": 2.24, + "start": 2.08, + "text": "me" + }, + { + "end": 2.8, + "start": 2.24, + "text": "realize" + }, + { + "end": 3.12, + "start": 2.8, + "text": "just" + }, + { + "end": 3.28, + "start": 3.12, + "text": "how" + }, + { + "end": 3.6, + "start": 3.28, + "text": "much" + }, + { + "end": 3.76, + "start": 3.6, + "text": "Ive" + }, + { + "end": 4.24, + "start": 3.84, + "text": "missed" + }, + { + "end": 4.56, + "start": 4.24, + "text": "you" + }, + { + "end": 4.8, + "start": 4.64, + "text": "We" + }, + { + "end": 5.04, + "start": 4.8, + "text": "have" + }, + { + "end": 5.36, + "start": 5.04, + "text": "so" + }, + { + "end": 5.76, + "start": 5.36, + "text": "much" + }, + { + "end": 5.76, + "start": 5.76, + "text": "to" + }, + { + "end": 6.16, + "start": 5.76, + "text": "catch" + }, + { + "end": 6.4, + "start": 6.16, + "text": "up" + }, + { + "end": 6.72, + "start": 6.4, + "text": "on" + }, + { + "end": 6.96, + "start": 6.8, + "text": "and" + }, + { + "end": 7.04, + "start": 6.96, + "text": "I" + }, + { + "end": 7.36, + "start": 7.04, + "text": "dont" + }, + { + "end": 7.6, + "start": 7.36, + "text": "even" + }, + { + "end": 7.84, + "start": 7.6, + "text": "know" + }, + { + "end": 8.08, + "start": 7.84, + "text": "which" + }, + { + "end": 8.4, + "start": 8.08, + "text": "part" + }, + { + "end": 8.48, + "start": 8.4, + "text": "of" + }, + { + "end": 8.72, + "start": 8.56, + "text": "my" + }, + { + "end": 8.96, + "start": 8.72, + "text": "life" + }, + { + "end": 9.12, + "start": 9.12, + "text": "to" + }, + { + "end": 9.44, + "start": 9.12, + "text": "tell" + }, + { + "end": 9.6, + "start": 9.44, + "text": 
"you" + }, + { + "end": 10, + "start": 9.6, + "text": "about" + }, + { + "end": 10.48, + "start": 10.08, + "text": "first" + } + ] + }, + "audio_base64": "//uSxImAl...", + "content": "Seeing you again now makes me realize just how much I’ve missed you. We have so much to catch up on, and I don’t even know which part of my life to tell you about first." + } + ], + "properties": { + "audio_base64": { + "description": "Base64 encoded audio chunk. Concatenate every chunk in event order to reconstruct the full audio.", + "title": "Audio Base64", + "type": "string" + }, + "content": { + "description": "Text content covered by this event's text chunk. Long input may be split into multiple content chunks in one stream.", + "title": "Content", + "type": "string" + }, + "alignment": { + "anyOf": [ + { + "$ref": "#/components/schemas/TTSTimestampAlignment" + }, + { + "type": "null" + } + ], + "description": "Timestamp information for this content chunk. Balanced streaming can produce multiple non-null alignments, one for each text chunk. Additional audio events for the same content chunk may return null." + } + }, + "required": [ + "audio_base64", + "content", + "alignment" + ], + "title": "TTSTimestampStreamEvent", + "type": "object" + }, + "examples": { + "first_event": { + "summary": "First text chunk event with alignment", + "value": "data: {\"audio_base64\": \"SUQzBAAAAAAA...\", \"content\": \"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? 
I’ve thought about calling you so many times, but I never knew where to start.\", \"alignment\": {\"segments\": [{\"text\": \"I\", \"start\": 0.0, \"end\": 0.16}, {\"text\": \"can't\", \"start\": 0.16, \"end\": 0.48}, {\"text\": \"believe\", \"start\": 0.48, \"end\": 0.8}, {\"text\": \"its\", \"start\": 0.8, \"end\": 1.12}, {\"text\": \"been\", \"start\": 1.2, \"end\": 1.44}, {\"text\": \"this\", \"start\": 1.44, \"end\": 1.76}, {\"text\": \"long\", \"start\": 1.76, \"end\": 2.48}, {\"text\": \"It\", \"start\": 2.56, \"end\": 2.64}, {\"text\": \"feels\", \"start\": 2.72, \"end\": 3.04}, {\"text\": \"like\", \"start\": 3.12, \"end\": 3.28}, {\"text\": \"forever\", \"start\": 3.36, \"end\": 4.0}, {\"text\": \"since\", \"start\": 4.0, \"end\": 4.32}, {\"text\": \"we\", \"start\": 4.32, \"end\": 4.48}, {\"text\": \"last\", \"start\": 4.48, \"end\": 4.96}, {\"text\": \"really\", \"start\": 4.96, \"end\": 5.28}, {\"text\": \"talked\", \"start\": 5.28, \"end\": 5.84}, {\"text\": \"Ive\", \"start\": 6.0, \"end\": 6.24}, {\"text\": \"missed\", \"start\": 6.24, \"end\": 6.64}, {\"text\": \"hearing\", \"start\": 6.64, \"end\": 6.96}, {\"text\": \"your\", \"start\": 6.96, \"end\": 7.2}, {\"text\": \"voice\", \"start\": 7.2, \"end\": 7.76}, {\"text\": \"your\", \"start\": 7.76, \"end\": 7.92}, {\"text\": \"stories\", \"start\": 7.92, \"end\": 8.48}, {\"text\": \"even\", \"start\": 8.48, \"end\": 8.72}, {\"text\": \"the\", \"start\": 8.72, \"end\": 8.8}, {\"text\": \"little\", \"start\": 8.8, \"end\": 9.2}, {\"text\": \"things\", \"start\": 9.2, \"end\": 9.52}, {\"text\": \"you\", \"start\": 9.52, \"end\": 9.68}, {\"text\": \"used\", \"start\": 9.68, \"end\": 10.0}, {\"text\": \"to\", \"start\": 10.0, \"end\": 10.08}, {\"text\": \"say\", \"start\": 10.08, \"end\": 10.64}, {\"text\": \"How\", \"start\": 10.64, \"end\": 10.96}, {\"text\": \"have\", \"start\": 10.96, \"end\": 11.12}, {\"text\": \"you\", \"start\": 11.12, \"end\": 11.36}, {\"text\": \"been\", \"start\": 11.36, 
\"end\": 11.92}, {\"text\": \"Ive\", \"start\": 12.0, \"end\": 12.24}, {\"text\": \"thought\", \"start\": 12.24, \"end\": 12.48}, {\"text\": \"about\", \"start\": 12.48, \"end\": 12.8}, {\"text\": \"calling\", \"start\": 12.8, \"end\": 13.2}, {\"text\": \"you\", \"start\": 13.2, \"end\": 13.36}, {\"text\": \"so\", \"start\": 13.36, \"end\": 13.68}, {\"text\": \"many\", \"start\": 13.68, \"end\": 13.92}, {\"text\": \"times\", \"start\": 13.92, \"end\": 14.56}, {\"text\": \"but\", \"start\": 14.56, \"end\": 14.72}, {\"text\": \"I\", \"start\": 14.72, \"end\": 14.88}, {\"text\": \"never\", \"start\": 14.88, \"end\": 15.2}, {\"text\": \"knew\", \"start\": 15.2, \"end\": 15.36}, {\"text\": \"where\", \"start\": 15.36, \"end\": 15.6}, {\"text\": \"to\", \"start\": 15.6, \"end\": 15.6}, {\"text\": \"start\", \"start\": 15.68, \"end\": 16.24}], \"audio_duration\": 16.24}}\n\n" + }, + "following_event": { + "summary": "Following audio-only event for the same text chunk", + "value": "data: {\"audio_base64\": \"//uSxOAAF...\", \"content\": \"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start.\", \"alignment\": null}\n\n" + }, + "later_text_chunk_event": { + "summary": "Later text chunk event with another alignment", + "value": "data: {\"audio_base64\": \"//uSxImAl...\", \"content\": \"Seeing you again now makes me realize just how much I’ve missed you. 
We have so much to catch up on, and I don’t even know which part of my life to tell you about first.\", \"alignment\": {\"segments\": [{\"text\": \"Seeing\", \"start\": 0.4, \"end\": 0.8}, {\"text\": \"you\", \"start\": 0.8, \"end\": 0.96}, {\"text\": \"again\", \"start\": 0.96, \"end\": 1.44}, {\"text\": \"now\", \"start\": 1.44, \"end\": 1.68}, {\"text\": \"makes\", \"start\": 1.68, \"end\": 2.08}, {\"text\": \"me\", \"start\": 2.08, \"end\": 2.24}, {\"text\": \"realize\", \"start\": 2.24, \"end\": 2.8}, {\"text\": \"just\", \"start\": 2.8, \"end\": 3.12}, {\"text\": \"how\", \"start\": 3.12, \"end\": 3.28}, {\"text\": \"much\", \"start\": 3.28, \"end\": 3.6}, {\"text\": \"Ive\", \"start\": 3.6, \"end\": 3.76}, {\"text\": \"missed\", \"start\": 3.84, \"end\": 4.24}, {\"text\": \"you\", \"start\": 4.24, \"end\": 4.56}, {\"text\": \"We\", \"start\": 4.64, \"end\": 4.8}, {\"text\": \"have\", \"start\": 4.8, \"end\": 5.04}, {\"text\": \"so\", \"start\": 5.04, \"end\": 5.36}, {\"text\": \"much\", \"start\": 5.36, \"end\": 5.76}, {\"text\": \"to\", \"start\": 5.76, \"end\": 5.76}, {\"text\": \"catch\", \"start\": 5.76, \"end\": 6.16}, {\"text\": \"up\", \"start\": 6.16, \"end\": 6.4}, {\"text\": \"on\", \"start\": 6.4, \"end\": 6.72}, {\"text\": \"and\", \"start\": 6.8, \"end\": 6.96}, {\"text\": \"I\", \"start\": 6.96, \"end\": 7.04}, {\"text\": \"dont\", \"start\": 7.04, \"end\": 7.36}, {\"text\": \"even\", \"start\": 7.36, \"end\": 7.6}, {\"text\": \"know\", \"start\": 7.6, \"end\": 7.84}, {\"text\": \"which\", \"start\": 7.84, \"end\": 8.08}, {\"text\": \"part\", \"start\": 8.08, \"end\": 8.4}, {\"text\": \"of\", \"start\": 8.4, \"end\": 8.48}, {\"text\": \"my\", \"start\": 8.56, \"end\": 8.72}, {\"text\": \"life\", \"start\": 8.72, \"end\": 8.96}, {\"text\": \"to\", \"start\": 9.12, \"end\": 9.12}, {\"text\": \"tell\", \"start\": 9.12, \"end\": 9.44}, {\"text\": \"you\", \"start\": 9.44, \"end\": 9.6}, {\"text\": \"about\", \"start\": 9.6, \"end\": 10.0}, 
{\"text\": \"first\", \"start\": 10.08, \"end\": 10.48}], \"audio_duration\": 10.48}}\n\n" + } + } + } + } + }, + "401": { + "description": "No permission -- see authorization schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "402": { + "description": "No payment -- see charging schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "422": { + "description": "", + "headers": {}, + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "loc": { + "title": "Location", + "description": "error field", + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "title": "Type", + "description": "error type", + "type": "string" + }, + "msg": { + "title": "Message", + "description": "error message", + "type": "string" + }, + "ctx": { + "title": "Context", + "description": "error context", + "type": "string" + }, + "in": { + "title": "In", + "type": "string", + "enum": [ + "path", + "query", + "header", + "cookie", + "body" + ] + } + }, + "required": [ + "loc", + "type", + "msg" + ] + } + } + } + } + } + }, + "tags": [ + "OpenAPI v1" ], "x-codeSamples": [ { "lang": "bash", - "label": "Single Speaker", - "source": "curl --request POST \\\n --url https://api.fish.audio/v1/tts \\\n --header 'Authorization: Bearer ' \\\n --header 'Content-Type: application/json' \\\n --header 'model: s2-pro' \\\n --data '{\n \"text\": \"Hello! 
Welcome to Fish Audio.\",\n \"reference_id\": \"model-id\",\n \"temperature\": 0.7,\n \"top_p\": 0.7,\n \"prosody\": {\n \"speed\": 1,\n \"volume\": 0,\n \"normalize_loudness\": true\n },\n \"chunk_length\": 300,\n \"normalize\": true,\n \"format\": \"mp3\",\n \"sample_rate\": 44100,\n \"mp3_bitrate\": 128,\n \"latency\": \"normal\",\n \"max_new_tokens\": 1024,\n \"repetition_penalty\": 1.2,\n \"min_chunk_length\": 50,\n \"condition_on_previous_chunks\": true,\n \"early_stop_threshold\": 1\n }'" - }, - { - "lang": "bash", - "label": "Multi Speaker (S2-Pro only)", - "source": "curl --request POST \\\n --url https://api.fish.audio/v1/tts \\\n --header 'Authorization: Bearer ' \\\n --header 'Content-Type: application/json' \\\n --header 'model: s2-pro' \\\n --data '{\n \"text\": \"<|speaker:0|>Hello!<|speaker:1|>Hi there!\",\n \"reference_id\": [\"speaker-a-id\", \"speaker-b-id\"],\n \"temperature\": 0.7,\n \"top_p\": 0.7,\n \"prosody\": {\n \"speed\": 1,\n \"volume\": 0,\n \"normalize_loudness\": true\n },\n \"chunk_length\": 300,\n \"normalize\": true,\n \"format\": \"mp3\",\n \"sample_rate\": 44100,\n \"mp3_bitrate\": 128,\n \"latency\": \"normal\",\n \"max_new_tokens\": 1024,\n \"repetition_penalty\": 1.2,\n \"min_chunk_length\": 50,\n \"condition_on_previous_chunks\": true,\n \"early_stop_threshold\": 1\n }'" + "label": "Stream With Timestamps", + "source": "curl --no-buffer --request POST \\\n --url https://api.fish.audio/v1/tts/stream/with-timestamp \\\n --header 'Authorization: Bearer ' \\\n --header 'Content-Type: application/json' \\\n --header 'model: s2-pro' \\\n --data '{\n \"text\": \"[happy] I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start. Seeing you again now makes me realize just how much I’ve missed you. 
We have so much to catch up on, and I don’t even know which part of my life to tell you about first.\",\n \"format\": \"opus\",\n \"normalize\": true,\n \"temperature\": 0.9,\n \"chunk_length\": 100,\n \"top_p\": 0.9,\n \"latency\": \"balanced\",\n \"sample_rate\": 48000,\n \"reference_id\": \"fbe02f8306fc4d3d915e9871722a39d5\"\n }'" } ] } @@ -3345,6 +4024,55 @@ "title": "TTSRequest", "type": "object" }, + "TTSTimestampAlignment": { + "properties": { + "segments": { + "description": "Ordered text timing segments for the generated audio.", + "items": { + "$ref": "#/components/schemas/TTSTimestampSegment" + }, + "title": "Segments", + "type": "array" + }, + "audio_duration": { + "description": "Audio duration in seconds for this alignment's content chunk.", + "title": "Audio Duration", + "type": "number" + } + }, + "required": [ + "segments", + "audio_duration" + ], + "title": "TTSTimestampAlignment", + "type": "object" + }, + "TTSTimestampSegment": { + "properties": { + "text": { + "description": "Text segment covered by this timing entry.", + "title": "Text", + "type": "string" + }, + "start": { + "description": "Segment start time in seconds.", + "title": "Start", + "type": "number" + }, + "end": { + "description": "Segment end time in seconds.", + "title": "End", + "type": "number" + } + }, + "required": [ + "text", + "start", + "end" + ], + "title": "TTSTimestampSegment", + "type": "object" + }, "ASRSegment": { "properties": { "text": { diff --git a/developer-guide/core-features/text-to-speech.mdx b/developer-guide/core-features/text-to-speech.mdx index 617d0b5..122c54d 100644 --- a/developer-guide/core-features/text-to-speech.mdx +++ b/developer-guide/core-features/text-to-speech.mdx @@ -430,6 +430,17 @@ Stream audio for real-time applications: +### Streaming with Timestamps + +Use the [Text to Speech Stream with Timestamps API](/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps) when you need generated audio and alignment data in the same 
stream. This endpoint returns Server-Sent Events where each event includes an `audio_base64` chunk, the matching text `content`, and an `alignment` object (or `null` on audio-only events) whose segments carry start and end times. + + + Timestamped streaming is best for karaoke-style highlighting, synchronized + captions, phrase progress indicators, and timeline editing. For this endpoint, + prefer `opus` over `mp3` when possible because Opus provides cleaner streaming + boundaries for alignment. + + ## Adding Emotions @@ -643,4 +654,4 @@ Need help with text-to-speech? - [API Reference](/api-reference/introduction) - **Discord Community:** [Join our Discord](https://discord.gg/fish-audio) -- **Email Support:** support@fish.audio \ No newline at end of file +- **Email Support:** support@fish.audio diff --git a/docs.json b/docs.json index 88bb795..7b501ab 100644 --- a/docs.json +++ b/docs.json @@ -177,6 +177,7 @@ "icon": "code", "pages": [ "api-reference/endpoint/openapi-v1/text-to-speech", + "api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps", "api-reference/endpoint/openapi-v1/speech-to-text", "api-reference/endpoint/websocket/tts-live" ] From 9e5e8e477e973043da9fa6c11ff81c93ac0d1b58 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Fri, 8 May 2026 16:06:31 +0900 Subject: [PATCH 2/3] docs: clarify timestamped stream chunk handling --- .../text-to-speech-stream-with-timestamps.mdx | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx index 54d1f65..347fc87 100644 --- a/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx +++ b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx @@ -175,6 +175,44 @@ The stream payload uses standard SSE framing. Parse each `data:` line as JSON, a +## Handling Split Content Chunks + +Long input can produce multiple `content` chunks. 
Treat audio and alignment as two related streams: + +1. Append every decoded `audio_base64` chunk in event order. Do this even when `alignment` is `null`. +2. Keep only non-null `alignment` objects for timing data. +3. Convert each alignment's local segment times into global times by adding the duration of all previous aligned content chunks. + + + `audio_base64` chunks are transport chunks, not sentence or word boundaries. + Do not try to align each audio chunk individually. Use `alignment.segments` + for text timing, and use `alignment.audio_duration` to offset later aligned + content chunks. + + +For example, if the first aligned content chunk has `audio_duration: 16.24`, add `16.24` seconds to every segment in the next non-null alignment before rendering it on the complete audio timeline. + +```javascript +function buildGlobalTimeline(alignments) { + const timeline = []; + let offsetSeconds = 0; + + for (const alignment of alignments) { + for (const segment of alignment.segments) { + timeline.push({ + text: segment.text, + start: segment.start + offsetSeconds, + end: segment.end + offsetSeconds, + }); + } + + offsetSeconds += alignment.audio_duration; + } + + return timeline; +} +``` + ## Format Guidance For timestamped streaming, we recommend `opus` with the default 48 kHz sample rate when your client supports it. Opus is designed for streaming and gives the best balance of quality, latency, and bandwidth for this endpoint. 
From 118cd1322db7dde5577d77ad4c5b56441614dcd6 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Fri, 8 May 2026 16:08:39 +0900 Subject: [PATCH 3/3] docs: add Python timestamp timeline sample --- .../text-to-speech-stream-with-timestamps.mdx | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx index 347fc87..17fb687 100644 --- a/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx +++ b/api-reference/endpoint/openapi-v1/text-to-speech-stream-with-timestamps.mdx @@ -192,26 +192,53 @@ Long input can produce multiple `content` chunks. Treat audio and alignment as t For example, if the first aligned content chunk has `audio_duration: 16.24`, add `16.24` seconds to every segment in the next non-null alignment before rendering it on the complete audio timeline. -```javascript -function buildGlobalTimeline(alignments) { - const timeline = []; - let offsetSeconds = 0; - - for (const alignment of alignments) { - for (const segment of alignment.segments) { - timeline.push({ - text: segment.text, - start: segment.start + offsetSeconds, - end: segment.end + offsetSeconds, - }); - } + + - offsetSeconds += alignment.audio_duration; - } + ```python + def build_global_timeline(alignments): + timeline = [] + offset_seconds = 0.0 - return timeline; -} -``` + for alignment in alignments: + for segment in alignment["segments"]: + timeline.append({ + "text": segment["text"], + "start": segment["start"] + offset_seconds, + "end": segment["end"] + offset_seconds, + }) + + offset_seconds += alignment["audio_duration"] + + return timeline + ``` + + + + + ```javascript + function buildGlobalTimeline(alignments) { + const timeline = []; + let offsetSeconds = 0; + + for (const alignment of alignments) { + for (const segment of alignment.segments) { + timeline.push({ + text: 
segment.text, + start: segment.start + offsetSeconds, + end: segment.end + offsetSeconds, + }); + } + + offsetSeconds += alignment.audio_duration; + } + + return timeline; + } + ``` + + + ## Format Guidance