From bb63b17d66979f8034d6bafc134c940ba35d6186 Mon Sep 17 00:00:00 2001 From: Owen McGirr Date: Sat, 11 Apr 2026 15:01:53 +0100 Subject: [PATCH] fix(azure): fix double-prosody and volume scale when prosody options are provided Two bugs in ensureAzureSSMLStructure when rate/pitch/volume options are passed: 1. Double-nested <prosody>: this.properties defaults (rate="medium", pitch="medium", volume=100) are always truthy, so the first block always added a default prosody, then the options block wrapped it with a second one. Merged both blocks into one: options override properties defaults, and a prosody element is only emitted when at least one value differs from Azure's implicit defaults. 2. Volume scale: callers commonly pass volume as a 0-1 float (e.g. 0.8), but the template literal appended "%" directly, producing volume="0.8%" (essentially silent) instead of volume="80%". Values in the range (0, 1] are now treated as fractions and normalised to the 0-100 scale before formatting. Fixes #40 --- src/__tests__/azure-mstts-namespace.test.ts | 35 +++++++++++ src/engines/azure.ts | 64 +++++++++++---------- 2 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/__tests__/azure-mstts-namespace.test.ts b/src/__tests__/azure-mstts-namespace.test.ts index d5d291e..c6e0462 100644 --- a/src/__tests__/azure-mstts-namespace.test.ts +++ b/src/__tests__/azure-mstts-namespace.test.ts @@ -149,5 +149,40 @@ describe("Azure MSTTS Namespace Handling", () => { expect(result).toMatch(/]*>\s*]*>/); expect(result).toMatch(/<\/prosody>\s*<\/voice>/); }); + + it("should produce a single element (not double-nested) when options are provided", async () => { + // Regression test for: https://github.com/willwade/js-tts-wrapper/issues/40 + // this.properties defaults (rate="medium", pitch="medium", volume=100) were always + // truthy, causing a first prosody to be added, then options adding a second one on top. 
+ const plainSSML = `Hello world`; + const options = { rate: "fast", pitch: "high", volume: 80 }; + + const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options); + + const prosodyMatches = result.match(/ when all values are at Azure defaults", async () => { + // No prosody element needed when everything is at the implicit default + const plainSSML = `Hello world`; + const options = { rate: "medium", pitch: "medium", volume: 100 }; + + const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options); + + expect(result).not.toContain(" { + // Regression test for: https://github.com/willwade/js-tts-wrapper/issues/40 + // Callers commonly pass volume as a 0-1 float; 0.8 should become volume="80%", not "0.8%". + const plainSSML = `Hello world`; + const options = { volume: 0.8 }; + + const result = (client as any).ensureAzureSSMLStructure(plainSSML, "en-US-JennyNeural", options); + + expect(result).toContain('volume="80%"'); + expect(result).not.toContain('volume="0.8%"'); + }); }); }); diff --git a/src/engines/azure.ts b/src/engines/azure.ts index 421273d..8f66355 100644 --- a/src/engines/azure.ts +++ b/src/engines/azure.ts @@ -616,38 +616,42 @@ export class AzureTTSClient extends AbstractTTSClient { } } - // Add prosody if properties are set - if (this.properties.rate || this.properties.pitch || this.properties.volume) { - // Extract content between voice tags or speak tags - let content = ""; - if (ssml.includes("]*>(.*?)<\/voice>/s); - if (match) { - content = match[1]; - const prosodyContent = this.constructProsodyTag(content); - ssml = ssml.replace(content, prosodyContent); - } - } else { - const match = ssml.match(/]*>(.*?)<\/speak>/s); - if (match) { - content = match[1]; - const prosodyContent = this.constructProsodyTag(content); - ssml = ssml.replace(content, prosodyContent); - } + // Build prosody attributes by merging this.properties defaults with per-call options. 
+ // Options take precedence. We only emit a <prosody> element when at least one + // attribute differs from Azure's implicit defaults (medium/medium/100%), to avoid + // wrapping content in a no-op element. + { + const DEFAULT_RATE = "medium"; + const DEFAULT_PITCH = "medium"; + const DEFAULT_VOLUME = 100; + + const rate = options?.rate ?? (this.properties.rate as string | undefined); + const pitch = options?.pitch ?? (this.properties.pitch as string | undefined); + // volume: SpeakOptions types volume as 0-100. Guard against callers who pass a + // 0-1 fraction by normalising: any value ≤ 1 (and > 0) is treated as a fraction + // and scaled to 0-100. + let rawVolume: number | undefined = + options?.volume !== undefined + ? options.volume + : (this.properties.volume as number | undefined); + if (rawVolume !== undefined && rawVolume > 0 && rawVolume <= 1) { + rawVolume = Math.round(rawVolume * 100); } - } + const volume = rawVolume !== undefined ? rawVolume : DEFAULT_VOLUME; + + const hasNonDefaultProsody = + (rate !== undefined && rate !== DEFAULT_RATE) || + (pitch !== undefined && pitch !== DEFAULT_PITCH) || + volume !== DEFAULT_VOLUME; + + if (hasNonDefaultProsody) { + const attrs: string[] = []; + if (rate && rate !== DEFAULT_RATE) attrs.push(`rate="${rate}"`); + if (pitch && pitch !== DEFAULT_PITCH) attrs.push(`pitch="${pitch}"`); + if (volume !== DEFAULT_VOLUME) attrs.push(`volume="${volume}%"`); - // Also add prosody from options if provided - if (options?.rate || options?.pitch || options?.volume !== undefined) { - // Create prosody attributes - const attrs: string[] = []; - if (options.rate) attrs.push(`rate="${options.rate}"`); - if (options.pitch) attrs.push(`pitch="${options.pitch}"`); - if (options.volume !== undefined) attrs.push(`volume="${options.volume}%"`); - - if (attrs.length > 0) { - // Extract content from inside <voice> if present, otherwise from <speak>. - // Prosody must be nested inside <voice>, not as a direct child of <speak>. 
+ // <prosody> must be nested inside <voice>, not as a direct child of <speak>. + // Azure rejects: Node [speak] should not contain node [prosody] with type [Others]. if (ssml.includes("]*>(.*?)<\/voice>/s); if (match) {