From b4e5e42260cd558e7e4dcd5d4661204d2b599815 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Thu, 7 May 2026 13:45:38 +0900 Subject: [PATCH 1/4] docs: add Japanese phoneme control guide --- .../core-features/fine-grained-control.mdx | 164 ++++++++++++++++-- 1 file changed, 147 insertions(+), 17 deletions(-) diff --git a/developer-guide/core-features/fine-grained-control.mdx b/developer-guide/core-features/fine-grained-control.mdx index 5700fd4..c316599 100644 --- a/developer-guide/core-features/fine-grained-control.mdx +++ b/developer-guide/core-features/fine-grained-control.mdx @@ -1,27 +1,29 @@ --- -title: 'Fine-grained Control' -description: 'Advanced control over speech generation' +title: "Fine-grained Control" +description: "Advanced control over speech generation" icon: "sliders" iconType: "solid" --- -import { AudioTranscript } from '/snippets/audio-transcript.jsx'; + +import { AudioTranscript } from "/snippets/audio-transcript.jsx"; {/* speak-mintlify-hash: 4a46ae86b04c62730f1554051768c47b306f2378a3624545d53de421d5d19acd */} + - ## Getting Started To use fine-grained control, you can use either our SDK, API, or Playground. -SDK/API: We recommend disabling normalization by setting `"normalize": false` in the request body. This ensures that the API doesn't alter the intonation of control tags. +SDK/API: Phoneme tags are preserved by text normalization, so you can keep the default normalization behavior for pronunciation control. Set `"normalize": false` only when you want to prevent normalization from rewriting the surrounding text, such as numbers, dates, or URLs. Playground: You can use V1.6 Control Model, without setting any other options. -Disabling normalization may reduce the stability of reading numbers, dates, and URLs. You'll need to handle these cases manually for best results. + Disabling normalization may reduce the stability of reading numbers, dates, + and URLs. You'll need to handle these cases manually for best results. ## Phoneme Control @@ -30,6 +32,7 @@ Phoneme control allows you to specify exact pronunciations for words or characte - CMU Arpabet (for English) - Pinyin (for Chinese) +- Japanese romaji phonemes with pitch accent markers To use phoneme control, wrap the desired pronunciation in `<|phoneme_start|>` and `<|phoneme_end|>` tags. Each tag should contain a single word or character. @@ -43,30 +46,157 @@ With phoneme control: "I am an `<|phoneme_start|>EH N JH AH N IH R<|phoneme_end| Standard: "我是一个工程师。" With phoneme control: "我是一个`<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>`。" +### Japanese Example + +Japanese phoneme control uses OpenJTalk-style romaji phonemes plus pitch accent information. This is useful for Japanese homographs that have the same phoneme sequence but different pitch accents, such as `端が`, `箸が`, and `橋が`. + +Standard: "橋が見えます。" +With phoneme control: "`<|phoneme_start|>ha0shi1ga0<|phoneme_end|>`見えます。" + +For Japanese, put the pitch level digit immediately after each vowel-bearing mora: + +- `0` means the current mora is low. +- `1` means the current mora is high. +- Consonants are written without spaces before the vowel they belong to, for example `ha`, `shi`, and `ga`. +- Use OpenJTalk phoneme symbols such as `a`, `i`, `u`, `e`, `o`, `N`, `cl`, `ky`, `sh`, `ch`, and `ts`. + +The following examples all share the plain phoneme sequence `h a sh i g a`, but the pitch markers disambiguate the word: + +- `端が` (edge + subject marker): `<|phoneme_start|>ha0shi1ga1<|phoneme_end|>` +- `箸が` (chopsticks + subject marker): `<|phoneme_start|>ha1shi0ga0<|phoneme_end|>` +- `橋が` (bridge + subject marker): `<|phoneme_start|>ha0shi1ga0<|phoneme_end|>` + +If you are converting from a tool that emits ttslearn-style prosody symbols, do not put literal `[` or `]` inside the Fish Audio phoneme tag. Convert the rising edge to `0` on the low mora, the falling edge to `1` on the high mora, or use the `J`/`L` edge notation described below. + + + Japanese pitch accent depends on the dictionary, reading, and dialect. + Generate the phoneme string from the same text you send to TTS, then listen + and adjust the digits when you need a specific accent. + + +#### Generating Japanese Phonemes + +You can generate Japanese phoneme strings with `pyopenjtalk`. The converter below follows the full-context label approach used in the [ttslearn Japanese Tacotron recipe](https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html#%E3%83%95%E3%83%AB%E3%82%B3%E3%83%B3%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%83%A9%E3%83%99%E3%83%AB%E3%81%8B%E3%82%89%E3%81%AE%E9%9F%B3%E7%B4%A0%E5%88%97%E3%81%8A%E3%82%88%E3%81%B3%E9%9F%BB%E5%BE%8B%E8%A8%98%E5%8F%B7%E3%81%AE%E6%8A%BD%E5%87%BA), reads the `A`-series accent-position values from OpenJTalk full-context labels, and emits Fish Audio's digit format instead of the `[` and `]` prosody symbols shown there. + +```bash +pip install pyopenjtalk +``` + +```python +import re + +import pyopenjtalk + + +JAPANESE_VOWELS = {"a", "i", "u", "e", "o"} + + +def _numeric_feature(pattern: str, label: str, default: int = -1) -> int: + match = re.search(pattern, label) + return int(match.group(1)) if match else default + + +def japanese_to_romaji_with_accent( + sentence: str, + boundary: bool = False, + rise_edge: str = "J", + fall_edge: str = "L", +) -> str: + labels = pyopenjtalk.extract_fullcontext(sentence) + text = "" + level = -1 + + for index, label in enumerate(labels): + phoneme = re.search(r"\-([^\+]*)\+", label).group(1) + if phoneme in {"sil", "pau"}: + continue + + # OpenJTalk may emit uppercase vowels for devoiced vowels. + if phoneme in {"A", "I", "U", "E", "O"}: + phoneme = phoneme.lower() + + text += phoneme + + a1 = _numeric_feature(r"/A:(\-?[0-9]+)\+", label) + a2 = _numeric_feature(r"\+(\d+)\+", label) + a3 = _numeric_feature(r"\+(\d+)/", label) + + next_phoneme = re.search(r"\-([^\+]*)\+", labels[index + 1]).group(1) + if next_phoneme in {"sil", "pau"}: + a2_next = -1 + else: + a2_next = _numeric_feature(r"\+(\d+)\+", labels[index + 1]) + + if a3 == 1 and a2_next == 1: + if boundary and level >= 0: + text += " " + elif level >= 0: + text += str(level) + level = -1 + elif a1 == 0 and a2_next == a2 + 1: + level = 0 + text += fall_edge if boundary else "1" + elif a2 == 1 and a2_next == 2: + level = 1 + text += rise_edge if boundary else "0" + elif phoneme in JAPANESE_VOWELS: + if level < 0: + level = 0 + if not boundary: + text += str(level) + + return text + + +print(japanese_to_romaji_with_accent("橋が")) +# ha0shi1ga0 +print(japanese_to_romaji_with_accent("橋が", boundary=True)) +# haJshiLga +``` + +Then place the result inside the phoneme tags: + +```text +<|phoneme_start|>ha0shi1ga0<|phoneme_end|> +``` + +Minimal request body: + +```json +{ + "text": "<|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。" +} +``` + +If your pipeline prefers edge markers instead of digits, set `boundary=True`. In that mode, `J` marks a rising edge and `L` marks a falling edge. Use one notation style consistently inside each phoneme tag. + ## Paralanguage Paralanguage controls allow you to add natural speech elements and pauses to make the generated speech sound more human-like. There are two main types of controls: ### Pause Words + You can use common pause words like "um", "uh", "嗯", "啊" to control the rhythm of the speech. ### Special Effects + The following special effects can be added using parentheses: -| Effect | Description | First Available | Stage | -|--------|-------------|-----------------|-------| -| `(break)` | Short pause | V1.6 | Experimental | -| `(long-break)` | Extended pause | V1.6 | Experimental | -| `(breath)` | Breathing sound | V1.6 | Experimental | -| `(laugh)` | Laughter sound | V1.6 | Experimental | -| `(cough)` | Coughing sound | V1.6 | Experimental | -| `(lip-smacking)` | Lip smacking sound | V1.6 | Experimental | -| `(sigh)` | Sighing sound | V1.6 | Experimental | +| Effect | Description | First Available | Stage | +| ---------------- | ------------------ | --------------- | ------------ | +| `(break)` | Short pause | V1.6 | Experimental | +| `(long-break)` | Extended pause | V1.6 | Experimental | +| `(breath)` | Breathing sound | V1.6 | Experimental | +| `(laugh)` | Laughter sound | V1.6 | Experimental | +| `(cough)` | Coughing sound | V1.6 | Experimental | +| `(lip-smacking)` | Lip smacking sound | V1.6 | Experimental | +| `(sigh)` | Sighing sound | V1.6 | Experimental | -The effects `(laugh)`, `(cough)`, `(lip-smacking)`, and `(sigh)` are developing. You may need to repeat them multiple times for better results. + The effects `(laugh)`, `(cough)`, `(lip-smacking)`, and `(sigh)` are + developing. You may need to repeat them multiple times for better results. Example: Standard: "I am an engineer." -With paralanguage: "I am, um, an (break) engineer." \ No newline at end of file +With paralanguage: "I am, um, an (break) engineer." From 661c57b7395e18cfb28c7466d847ae4859f58905 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Thu, 7 May 2026 14:02:43 +0900 Subject: [PATCH 2/4] docs: split fine-grained phoneme guides --- .../core-features/fine-grained-control.mdx | 191 ++++++------------ .../fine-grained-control/chinese.mdx | 98 +++++++++ .../fine-grained-control/english.mdx | 107 ++++++++++ .../fine-grained-control/japanese.mdx | 163 +++++++++++++++ docs.json | 11 +- llms.txt | 3 + 6 files changed, 438 insertions(+), 135 deletions(-) create mode 100644 developer-guide/core-features/fine-grained-control/chinese.mdx create mode 100644 developer-guide/core-features/fine-grained-control/english.mdx create mode 100644 developer-guide/core-features/fine-grained-control/japanese.mdx diff --git a/developer-guide/core-features/fine-grained-control.mdx b/developer-guide/core-features/fine-grained-control.mdx index c316599..1616f28 100644 --- a/developer-guide/core-features/fine-grained-control.mdx +++ b/developer-guide/core-features/fine-grained-control.mdx @@ -28,148 +28,63 @@ Playground: You can use V1.6 Control Model, without setting any other options. ## Phoneme Control -Phoneme control allows you to specify exact pronunciations for words or characters. Currently, we support: +Phoneme control allows you to specify exact pronunciations for words, characters, or short phrases. Wrap the desired pronunciation in `<|phoneme_start|>` and `<|phoneme_end|>` tags. + +The replacement scope depends on the language: + +- English: replace one word with CMU Arpabet or IPA. +- Chinese: replace one character or syllable with tone-number pinyin. +- Japanese: replace a short Japanese word or phrase with OpenJTalk-style romaji and pitch accent markers. + + + + CMU Arpabet and IPA examples for names, homographs, acronyms, and technical + terms. + + + + Tone-number pinyin examples for multi-character words, tones, and polyphonic + characters. + + + + OpenJTalk romaji phonemes with pitch accent digits or rising/falling edge + markers. + + + +### Quick Examples + +English: -- CMU Arpabet (for English) -- Pinyin (for Chinese) -- Japanese romaji phonemes with pitch accent markers - -To use phoneme control, wrap the desired pronunciation in `<|phoneme_start|>` and `<|phoneme_end|>` tags. Each tag should contain a single word or character. - -### English Example - -Standard: "I am an engineer." -With phoneme control: "I am an `<|phoneme_start|>EH N JH AH N IH R<|phoneme_end|>`." - -### Chinese Example - -Standard: "我是一个工程师。" -With phoneme control: "我是一个`<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>`。" - -### Japanese Example - -Japanese phoneme control uses OpenJTalk-style romaji phonemes plus pitch accent information. This is useful for Japanese homographs that have the same phoneme sequence but different pitch accents, such as `端が`, `箸が`, and `橋が`. - -Standard: "橋が見えます。" -With phoneme control: "`<|phoneme_start|>ha0shi1ga0<|phoneme_end|>`見えます。" - -For Japanese, put the pitch level digit immediately after each vowel-bearing mora: - -- `0` means the current mora is low. -- `1` means the current mora is high. -- Consonants are written without spaces before the vowel they belong to, for example `ha`, `shi`, and `ga`. -- Use OpenJTalk phoneme symbols such as `a`, `i`, `u`, `e`, `o`, `N`, `cl`, `ky`, `sh`, `ch`, and `ts`. - -The following examples all share the plain phoneme sequence `h a sh i g a`, but the pitch markers disambiguate the word: - -- `端が` (edge + subject marker): `<|phoneme_start|>ha0shi1ga1<|phoneme_end|>` -- `箸が` (chopsticks + subject marker): `<|phoneme_start|>ha1shi0ga0<|phoneme_end|>` -- `橋が` (bridge + subject marker): `<|phoneme_start|>ha0shi1ga0<|phoneme_end|>` - -If you are converting from a tool that emits ttslearn-style prosody symbols, do not put literal `[` or `]` inside the Fish Audio phoneme tag. Convert the rising edge to `0` on the low mora, the falling edge to `1` on the high mora, or use the `J`/`L` edge notation described below. - - - Japanese pitch accent depends on the dictionary, reading, and dialect. - Generate the phoneme string from the same text you send to TTS, then listen - and adjust the digits when you need a specific accent. - - -#### Generating Japanese Phonemes - -You can generate Japanese phoneme strings with `pyopenjtalk`. The converter below follows the full-context label approach used in the [ttslearn Japanese Tacotron recipe](https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html#%E3%83%95%E3%83%AB%E3%82%B3%E3%83%B3%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%83%A9%E3%83%99%E3%83%AB%E3%81%8B%E3%82%89%E3%81%AE%E9%9F%B3%E7%B4%A0%E5%88%97%E3%81%8A%E3%82%88%E3%81%B3%E9%9F%BB%E5%BE%8B%E8%A8%98%E5%8F%B7%E3%81%AE%E6%8A%BD%E5%87%BA), reads the `A`-series accent-position values from OpenJTalk full-context labels, and emits Fish Audio's digit format instead of the `[` and `]` prosody symbols shown there. - -```bash -pip install pyopenjtalk -``` - -```python -import re - -import pyopenjtalk - - -JAPANESE_VOWELS = {"a", "i", "u", "e", "o"} - - -def _numeric_feature(pattern: str, label: str, default: int = -1) -> int: - match = re.search(pattern, label) - return int(match.group(1)) if match else default - - -def japanese_to_romaji_with_accent( - sentence: str, - boundary: bool = False, - rise_edge: str = "J", - fall_edge: str = "L", -) -> str: - labels = pyopenjtalk.extract_fullcontext(sentence) - text = "" - level = -1 - - for index, label in enumerate(labels): - phoneme = re.search(r"\-([^\+]*)\+", label).group(1) - if phoneme in {"sil", "pau"}: - continue - - # OpenJTalk may emit uppercase vowels for devoiced vowels. - if phoneme in {"A", "I", "U", "E", "O"}: - phoneme = phoneme.lower() - - text += phoneme - - a1 = _numeric_feature(r"/A:(\-?[0-9]+)\+", label) - a2 = _numeric_feature(r"\+(\d+)\+", label) - a3 = _numeric_feature(r"\+(\d+)/", label) - - next_phoneme = re.search(r"\-([^\+]*)\+", labels[index + 1]).group(1) - if next_phoneme in {"sil", "pau"}: - a2_next = -1 - else: - a2_next = _numeric_feature(r"\+(\d+)\+", labels[index + 1]) - - if a3 == 1 and a2_next == 1: - if boundary and level >= 0: - text += " " - elif level >= 0: - text += str(level) - level = -1 - elif a1 == 0 and a2_next == a2 + 1: - level = 0 - text += fall_edge if boundary else "1" - elif a2 == 1 and a2_next == 2: - level = 1 - text += rise_edge if boundary else "0" - elif phoneme in JAPANESE_VOWELS: - if level < 0: - level = 0 - if not boundary: - text += str(level) - - return text - - -print(japanese_to_romaji_with_accent("橋が")) -# ha0shi1ga0 -print(japanese_to_romaji_with_accent("橋が", boundary=True)) -# haJshiLga +```text +I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>. ``` -Then place the result inside the phoneme tags: +Chinese: ```text -<|phoneme_start|>ha0shi1ga0<|phoneme_end|> +我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。 ``` -Minimal request body: +Japanese: -```json -{ - "text": "<|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。" -} +```text +<|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。 ``` -If your pipeline prefers edge markers instead of digits, set `boundary=True`. In that mode, `J` marks a rising edge and `L` marks a falling edge. Use one notation style consistently inside each phoneme tag. - ## Paralanguage Paralanguage controls allow you to add natural speech elements and pauses to make the generated speech sound more human-like. There are two main types of controls: @@ -198,5 +113,13 @@ The following special effects can be added using parentheses: Example: -Standard: "I am an engineer." -With paralanguage: "I am, um, an (break) engineer." + +```text +I am, um, an (break) engineer. +``` + +You can combine paralanguage and phoneme control in the same text: + +```text +I am, um, an (break) <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>. +``` diff --git a/developer-guide/core-features/fine-grained-control/chinese.mdx b/developer-guide/core-features/fine-grained-control/chinese.mdx new file mode 100644 index 0000000..892162e --- /dev/null +++ b/developer-guide/core-features/fine-grained-control/chinese.mdx @@ -0,0 +1,98 @@ +--- +title: "Chinese Phoneme Control" +description: "Control Chinese pronunciation with tone-number pinyin" +icon: "language" +--- + +## Overview + +Chinese phoneme control uses pinyin with tone numbers, also known as tone3 pinyin. Wrap one syllable in each `<|phoneme_start|>` and `<|phoneme_end|>` tag. + +```text +我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。 +``` + +This format is especially useful for polyphonic characters, names, and domain-specific terms where the default reading may be ambiguous. + +## Tone Numbers + +Put the tone number at the end of each pinyin syllable: + +| Tone | Example | Description | +| ---- | ------- | ----------- | +| 1 | `ma1` | High level | +| 2 | `ma2` | Rising | +| 3 | `ma3` | Dipping | +| 4 | `ma4` | Falling | +| 5 | `ma5` | Neutral | + +Use lowercase pinyin and keep punctuation outside the phoneme tag. + +## Multi-character Words + +For a multi-character word, place adjacent phoneme tags in the same order as the original characters: + +```text +Standard: 我是一个工程师。 +With phoneme control: 我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。 +``` + +You can also tag only the ambiguous character and leave the rest of the sentence unchanged: + +```text +请把这个字读作<|phoneme_start|>hang2<|phoneme_end|>。 +``` + +## Polyphonic Characters + +For polyphonic characters, choose the pinyin that matches the phrase meaning: + +```text +重庆: <|phoneme_start|>chong2<|phoneme_end|><|phoneme_start|>qing4<|phoneme_end|> +重要: <|phoneme_start|>zhong4<|phoneme_end|><|phoneme_start|>yao4<|phoneme_end|> +``` + +```text +银行: <|phoneme_start|>yin2<|phoneme_end|><|phoneme_start|>hang2<|phoneme_end|> +行走: <|phoneme_start|>xing2<|phoneme_end|><|phoneme_start|>zou3<|phoneme_end|> +``` + +```text +音乐: <|phoneme_start|>yin1<|phoneme_end|><|phoneme_start|>yue4<|phoneme_end|> +快乐: <|phoneme_start|>kuai4<|phoneme_end|><|phoneme_start|>le4<|phoneme_end|> +``` + +## Generate Pinyin + +The training pipeline uses the `pypinyin` dictionary and converts entries to tone3 pinyin. The helper below mirrors that behavior for single characters: + +```bash +pip install pypinyin +``` + +```python +from pypinyin.contrib.tone_convert import to_tone3 +from pypinyin.pinyin_dict import pinyin_dict + + +def chinese_char_to_pinyin(char: str) -> str | None: + pinyin = pinyin_dict.get(ord(char)) + if pinyin is None: + return None + if "," in pinyin: + raise ValueError(f"{char} has multiple readings; choose one manually") + return to_tone3(pinyin) + + +print(chinese_char_to_pinyin("工")) +# gong1 +``` + +Phrase-level words can require a phrase dictionary or manual selection. For example, `重` should be `chong2` in `重庆` but `zhong4` in `重要`. + +## Practical Tips + +- Use one phoneme tag per Chinese character or syllable. +- Keep Chinese punctuation, brackets, and spaces outside the tag. +- Choose readings manually for names and polyphonic characters. +- Use `ma5`-style tone 5 when you need to mark a neutral tone explicitly. diff --git a/developer-guide/core-features/fine-grained-control/english.mdx b/developer-guide/core-features/fine-grained-control/english.mdx new file mode 100644 index 0000000..6894a87 --- /dev/null +++ b/developer-guide/core-features/fine-grained-control/english.mdx @@ -0,0 +1,107 @@ +--- +title: "English Phoneme Control" +description: "Control English pronunciation with CMU Arpabet or IPA" +icon: "language" +--- + +## Overview + +English phoneme control supports two notation styles: + +- CMU Arpabet, the pronunciation format used by CMUdict. +- IPA, for workflows that already store pronunciations as International Phonetic Alphabet strings. + +Wrap the pronunciation for one word in `<|phoneme_start|>` and `<|phoneme_end|>`, and keep surrounding punctuation outside the tag. + +```text +I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>. +``` + +## CMU Arpabet + +CMU Arpabet is written as space-separated uppercase symbols. Vowels can include stress digits: + +- `0` for unstressed vowels. +- `1` for primary stress. +- `2` for secondary stress. + +Example: + +```text +Standard: I am an engineer. +With phoneme control: I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>. +``` + +You can omit stress digits when you only need a rough pronunciation, but CMUdict-style output with stress digits usually gives the model the clearest signal. + +## IPA + +IPA can be written as a continuous IPA string inside the same phoneme tags. Do not include dictionary slashes unless you intentionally want those characters spoken. + +```text +Standard: The data is ready. +With phoneme control: The <|phoneme_start|>ˈdeɪtə<|phoneme_end|> is ready. +``` + +Use one notation style inside each tag. For example, do not mix `AH0` and `ə` in the same phoneme span. + +## Common Examples + +Use phoneme control when spelling alone is ambiguous: + +```text +The <|phoneme_start|>R IY1 D<|phoneme_end|> endpoint returns the current state. +The book was <|phoneme_start|>R EH1 D<|phoneme_end|> yesterday. +``` + +```text +The <|phoneme_start|>B EY1 S<|phoneme_end|> line is too loud. +The <|phoneme_start|>B AE1 S<|phoneme_end|> swam upstream. +``` + +```text +The <|phoneme_start|>P OW1 L IH0 SH<|phoneme_end|> team joined the call. +Please <|phoneme_start|>P AA1 L IH0 SH<|phoneme_end|> the final mix. +``` + +Use it for product names, acronyms, and technical terms: + +```text +Deploy with <|phoneme_start|>K UW2 B ER0 N EH1 T IY0 Z<|phoneme_end|>. +The query uses <|phoneme_start|>EH1 S K Y UW1 EH1 L<|phoneme_end|>. +``` + +## Generate CMU Arpabet + +The training pipeline uses CMUdict-style pronunciations. You can generate the same format with the `cmudict` package: + +```bash +pip install cmudict +``` + +```python +import cmudict + + +entries = cmudict.dict() + + +def cmu_pronunciation(word: str) -> str | None: + phones = entries.get(word.lower()) + if not phones: + return None + return " ".join(phones[0]) + + +print(cmu_pronunciation("engineer")) +# EH1 N JH AH0 N IH1 R +``` + +CMUdict may contain multiple pronunciations for the same word. Listen to the result and choose the variant that matches your intended accent or context. + +## Practical Tips + +- Replace only the word whose pronunciation needs control. +- Strip punctuation before dictionary lookup, then place punctuation after the tag. +- Use CMU Arpabet for dictionary-derived pronunciations and IPA when your source material already uses IPA. +- For names and brands, write the pronunciation that you want the listener to hear, not necessarily the spelling. diff --git a/developer-guide/core-features/fine-grained-control/japanese.mdx b/developer-guide/core-features/fine-grained-control/japanese.mdx new file mode 100644 index 0000000..5697c0c --- /dev/null +++ b/developer-guide/core-features/fine-grained-control/japanese.mdx @@ -0,0 +1,163 @@ +--- +title: "Japanese Phoneme Control" +description: "Control Japanese pronunciation with romaji phonemes and pitch accent markers" +icon: "language" +--- + +## Overview + +Japanese phoneme control uses OpenJTalk-style romaji phonemes plus pitch accent information. This is useful for Japanese homographs that have the same plain phoneme sequence but different pitch accents, such as `端が`, `箸が`, and `橋が`. + +```text +Standard: 橋が見えます。 +With phoneme control: <|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。 +``` + +Unlike Chinese, Japanese phoneme control is usually applied to a short word or phrase, not one tag per character. + +## Format + +Put the pitch level digit immediately after each vowel-bearing mora: + +- `0` means the current mora is low. +- `1` means the current mora is high. +- `N` can also carry a pitch digit. +- Consonants are written without spaces before the vowel they belong to, for example `ha`, `shi`, and `ga`. +- Use OpenJTalk phoneme symbols such as `a`, `i`, `u`, `e`, `o`, `N`, `cl`, `ky`, `sh`, `ch`, and `ts`. + +The following examples all share the plain phoneme sequence `h a sh i g a`, but the pitch markers disambiguate the word: + +- `端が` (edge + subject marker): `<|phoneme_start|>ha0shi1ga1<|phoneme_end|>` +- `箸が` (chopsticks + subject marker): `<|phoneme_start|>ha1shi0ga0<|phoneme_end|>` +- `橋が` (bridge + subject marker): `<|phoneme_start|>ha0shi1ga0<|phoneme_end|>` + + + Japanese pitch accent depends on the dictionary, reading, and dialect. + Generate the phoneme string from the same text you send to TTS, then listen + and adjust the digits when you need a specific accent. + + +## Relation to ttslearn Prosody Symbols + +The [ttslearn Japanese Tacotron recipe](https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html#%E3%83%95%E3%83%AB%E3%82%B3%E3%83%B3%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%83%A9%E3%83%99%E3%83%AB%E3%81%8B%E3%82%89%E3%81%AE%E9%9F%B3%E7%B4%A0%E5%88%97%E3%81%8A%E3%82%88%E3%81%B3%E9%9F%BB%E5%BE%8B%E8%A8%98%E5%8F%B7%E3%81%AE%E6%8A%BD%E5%87%BA) shows how to extract phonemes and prosody symbols from OpenJTalk full-context labels. That recipe prints symbols such as `[` for a pitch rise and `]` for a pitch fall. + +Fish Audio phoneme tags should not contain literal `[` or `]`. Convert that prosody into either: + +- Digit notation, such as `ha0shi1ga0`. +- Edge notation, such as `haJshiLga`, where `J` marks a rising edge and `L` marks a falling edge. + +Use one notation style consistently inside each phoneme tag. + +## Generate Japanese Phonemes + +You can generate Japanese phoneme strings with `pyopenjtalk`. The converter below follows the same full-context label logic used in training: + +```bash +pip install pyopenjtalk +``` + +```python +import re + +import pyopenjtalk + + +JAPANESE_VOWELS = "aiueoAIUEON" + + +def japanese_to_romaji_with_accent( + sentence: str, + boundary: bool = False, + rise_edge: str = "J", + fall_edge: str = "L", +) -> str: + text = "" + labels = pyopenjtalk.extract_fullcontext(sentence) + level = -1 + + for index, label in enumerate(labels): + phoneme = re.search(r"\-([^\+]*)\+", label).group(1) + if phoneme in ["sil", "pau"]: + continue + + text += phoneme + + a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) + a2 = int(re.search(r"\+(\d+)\+", label).group(1)) + a3 = int(re.search(r"\+(\d+)/", label).group(1)) + + next_phoneme = re.search(r"\-([^\+]*)\+", labels[index + 1]).group(1) + if next_phoneme in ["sil", "pau"]: + a2_next = -1 + else: + a2_next = int(re.search(r"\+(\d+)\+", labels[index + 1]).group(1)) + + # Accent phrase boundary + if a3 == 1 and a2_next == 1: + if boundary: + if level >= 0: + text += " " + else: + if level >= 0: + text += str(level) + level = -1 + # Falling + elif a1 == 0 and a2_next == a2 + 1: + level = 0 + if boundary: + text += fall_edge + else: + text += "1" + # Rising + elif a2 == 1 and a2_next == 2: + level = 1 + if boundary: + text += rise_edge + else: + text += "0" + elif phoneme in JAPANESE_VOWELS: + if level < 0: + level = 0 + if not boundary: + text += str(level) + + return text + + +print(japanese_to_romaji_with_accent("橋が")) +# ha0shi1ga0 +print(japanese_to_romaji_with_accent("橋が", boundary=True)) +# haJshiLga +``` + +Then place the result inside the phoneme tags: + +```text +<|phoneme_start|>ha0shi1ga0<|phoneme_end|> +``` + +Minimal request body: + +```json +{ + "text": "<|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。" +} +``` + +## Processing Longer Text + +For long Japanese text, split on punctuation and tag short Japanese runs instead of wrapping an entire paragraph. The training augmentation used short segments and skipped empty or very long spans. + +Good: + +```text +<|phoneme_start|>ha0shi1ga0<|phoneme_end|>、見えます。 +``` + +Avoid: + +```text +<|phoneme_start|>very long paragraph with multiple clauses...<|phoneme_end|> +``` + +If your text contains symbols that OpenJTalk should read as words, normalize them before conversion. For example, the training preprocessor converted `%` to `パーセント` before extracting phonemes. diff --git a/docs.json b/docs.json index 0b54fa3..88bb795 100644 --- a/docs.json +++ b/docs.json @@ -36,7 +36,16 @@ "pages": [ "developer-guide/core-features/text-to-speech", "developer-guide/core-features/emotions", - "developer-guide/core-features/fine-grained-control", + { + "group": "Fine-grained Control", + "icon": "sliders", + "pages": [ + "developer-guide/core-features/fine-grained-control", + "developer-guide/core-features/fine-grained-control/english", + "developer-guide/core-features/fine-grained-control/chinese", + "developer-guide/core-features/fine-grained-control/japanese" + ] + }, "developer-guide/core-features/creating-models", "developer-guide/core-features/speech-to-text" ] diff --git a/llms.txt b/llms.txt index a90ec01..f673a75 100644 --- a/llms.txt +++ b/llms.txt @@ -45,6 +45,9 @@ - [Creating Voice Models](https://docs.fish.audio/developer-guide/core-features/creating-models.md): Learn how to create custom voice models with Fish Audio. - [Emotion Control](https://docs.fish.audio/developer-guide/core-features/emotions.md): Add natural emotions and expressions to your AI-generated speech. - [Fine-grained Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control.md): Advanced control over speech generation. +- [English Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/english.md): Control English pronunciation with CMU Arpabet or IPA. +- [Chinese Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/chinese.md): Control Chinese pronunciation with tone-number pinyin. +- [Japanese Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/japanese.md): Control Japanese pronunciation with romaji phonemes and pitch accent markers. - [Voice Cloning Best Practices](https://docs.fish.audio/developer-guide/best-practices/voice-cloning.md): Improve voice cloning quality and consistency. - [Real-time Voice Streaming](https://docs.fish.audio/developer-guide/best-practices/real-time-streaming.md): Stream voice generation in real time for interactive applications. From a2484ee00251d8635394cc9715b9a513d47fce1b Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Thu, 7 May 2026 14:59:17 +0900 Subject: [PATCH 3/4] docs: clarify English phoneme format --- .../core-features/fine-grained-control.mdx | 5 ++-- .../fine-grained-control/english.mdx | 25 ++++++------------- llms.txt | 2 +- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/developer-guide/core-features/fine-grained-control.mdx b/developer-guide/core-features/fine-grained-control.mdx index 1616f28..d2070f8 100644 --- a/developer-guide/core-features/fine-grained-control.mdx +++ b/developer-guide/core-features/fine-grained-control.mdx @@ -32,7 +32,7 @@ Phoneme control allows you to specify exact pronunciations for words, characters The replacement scope depends on the language: -- English: replace one word with CMU Arpabet or IPA. +- English: replace one word with CMU Arpabet. - Chinese: replace one character or syllable with tone-number pinyin. - Japanese: replace a short Japanese word or phrase with OpenJTalk-style romaji and pitch accent markers. @@ -42,8 +42,7 @@ The replacement scope depends on the language: icon="language" href="/developer-guide/core-features/fine-grained-control/english" > - CMU Arpabet and IPA examples for names, homographs, acronyms, and technical - terms. + CMU Arpabet examples for names, homographs, acronyms, and technical terms. ` and `<|phoneme_end|>`, and keep surrounding punctuation outside the tag. @@ -17,6 +14,11 @@ Wrap the pronunciation for one word in `<|phoneme_start|>` and `<|phoneme_end|>` I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>. ``` + + IPA is not supported for English phoneme tags. Convert IPA pronunciations to + CMU Arpabet before using phoneme control. + + ## CMU Arpabet CMU Arpabet is written as space-separated uppercase symbols. Vowels can include stress digits: @@ -34,17 +36,6 @@ With phoneme control: I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end You can omit stress digits when you only need a rough pronunciation, but CMUdict-style output with stress digits usually gives the model the clearest signal. -## IPA - -IPA can be written as a continuous IPA string inside the same phoneme tags. Do not include dictionary slashes unless you intentionally want those characters spoken. - -```text -Standard: The data is ready. -With phoneme control: The <|phoneme_start|>ˈdeɪtə<|phoneme_end|> is ready. -``` - -Use one notation style inside each tag. For example, do not mix `AH0` and `ə` in the same phoneme span. - ## Common Examples Use phoneme control when spelling alone is ambiguous: @@ -103,5 +94,5 @@ CMUdict may contain multiple pronunciations for the same word. Listen to the res - Replace only the word whose pronunciation needs control. - Strip punctuation before dictionary lookup, then place punctuation after the tag. -- Use CMU Arpabet for dictionary-derived pronunciations and IPA when your source material already uses IPA. +- Use CMU Arpabet for English phoneme tags. - For names and brands, write the pronunciation that you want the listener to hear, not necessarily the spelling. diff --git a/llms.txt b/llms.txt index f673a75..4b88bc6 100644 --- a/llms.txt +++ b/llms.txt @@ -45,7 +45,7 @@ - [Creating Voice Models](https://docs.fish.audio/developer-guide/core-features/creating-models.md): Learn how to create custom voice models with Fish Audio. - [Emotion Control](https://docs.fish.audio/developer-guide/core-features/emotions.md): Add natural emotions and expressions to your AI-generated speech. - [Fine-grained Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control.md): Advanced control over speech generation. -- [English Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/english.md): Control English pronunciation with CMU Arpabet or IPA. +- [English Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/english.md): Control English pronunciation with CMU Arpabet. - [Chinese Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/chinese.md): Control Chinese pronunciation with tone-number pinyin. - [Japanese Phoneme Control](https://docs.fish.audio/developer-guide/core-features/fine-grained-control/japanese.md): Control Japanese pronunciation with romaji phonemes and pitch accent markers. - [Voice Cloning Best Practices](https://docs.fish.audio/developer-guide/best-practices/voice-cloning.md): Improve voice cloning quality and consistency. From 42459b12bfed95d45883690901b1f675c80497c4 Mon Sep 17 00:00:00 2001 From: Kilerd Chan Date: Thu, 7 May 2026 15:00:28 +0900 Subject: [PATCH 4/4] docs: link CMUdict phoneme symbols --- developer-guide/core-features/fine-grained-control/english.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/developer-guide/core-features/fine-grained-control/english.mdx b/developer-guide/core-features/fine-grained-control/english.mdx index 38c6ae3..08510d8 100644 --- a/developer-guide/core-features/fine-grained-control/english.mdx +++ b/developer-guide/core-features/fine-grained-control/english.mdx @@ -27,6 +27,8 @@ CMU Arpabet is written as space-separated uppercase symbols. Vowels can include - `1` for primary stress. - `2` for secondary stress. +For the full symbol inventory, see the CMUdict [`cmudict.symbols`](https://github.com/cmusphinx/cmudict/blob/master/cmudict.symbols) list. You can also look up words on the [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) page. + Example: ```text