Add traffictype to Speech TTS requests. (#1431)

ms-feizhao · web-flow · commit 429fbfa1afcc · 2026-01-05T11:28:53.000-08:00
* add traffictype=localmcp tag to TTS requests from local MCP

* fix speech recognition (SR) live tests
diff --git a/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs b/tools/Azure.Mcp.Tools.Speech/src/Services/Synthesizers/RealtimeTtsSynthesizer.cs
@@ -105,8 +105,13 @@ public async Task<SynthesisResult> SynthesizeToFileAsync(
         var tokenRequestContext = new TokenRequestContext(["https://cognitiveservices.azure.com/.default"]);
         var accessToken = await credential.GetTokenAsync(tokenRequestContext, cancellationToken);
 
+        // Convert https endpoint to wss for WebSocket-based TTS
+        var wssEndpoint = endpoint
+            .Replace("https://", "wss://", StringComparison.OrdinalIgnoreCase)
+            .TrimEnd('/') + "/tts/cognitiveservices/websocket/v1?traffictype=localmcp";
+
         // Configure Speech SDK with endpoint
-        var config = SpeechConfig.FromEndpoint(new Uri(endpoint));
+        var config = SpeechConfig.FromEndpoint(new Uri(wssEndpoint));
 
         // Set the authorization token
         config.AuthorizationToken = accessToken.Token;
diff --git a/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs b/tools/Azure.Mcp.Tools.Speech/src/SpeechSetup.cs
@@ -40,8 +40,8 @@ public CommandGroup RegisterCommands(IServiceProvider serviceProvider)
         var speech = new CommandGroup(Name,
             """
             Speech operations - Commands for Azure AI Services Speech functionality including speech-to-text (STT) 
-            recognition, audio processing, and language detection. Use this tool when you need to convert spoken 
-            audio to text, process audio files, or work with speech recognition services. This tool supports 
+            recognition, text-to-speech (TTS) synthesis, audio processing, and language detection. Use this tool when you need to convert spoken 
+            audio to text, convert text to spoken audio, process audio files, or work with speech recognition services. This tool supports 
             multiple audio formats, configurable recognition languages, profanity filtering options, and both 
             simple and detailed output formats. This tool is a hierarchical MCP command router where sub-commands 
             are routed to MCP servers that require specific fields inside the "parameters" object. To invoke a 
diff --git a/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs b/tools/Azure.Mcp.Tools.Speech/tests/Azure.Mcp.Tools.Speech.LiveTests/SpeechCommandTests.cs
@@ -40,8 +40,8 @@ public async Task SpeechToText_ShouldHandleMissingAudioFileGracefully()
     [InlineData("en-US", "TheGreatGatsby.wav", "In my younger and more vulnerable years, my father gave me some advice that I've been turning over in my mind ever since. Whenever you feel like criticizing anyone, he told me, just remember that all the people in this world haven't had the advantages that you've had. He didn't say anymore, but we've always been unusually commutative in a reserved way, and I understood that he meant a great deal more than that. In consequence, I'm inclined to reserve all judgments, a habit that has opened up many curious natures to me.")]
     [InlineData("ar-AE", "ar-rewind-music.wav", "ارجع الموسيقى 20 ثانية.")]
     [InlineData("es-ES", "es-ES.wav", "Rebobinar la música 20 segundos.")]
-    [InlineData("fr-FR", "fr-FR.wav", "Rembobinez la musique de Vingt secondes.")]
-    [InlineData("de-DE", "de-DE.wav", "Treffen heute um 17 Uhr.")]
+    [InlineData("fr-FR", "fr-FR.wav", "Rembobinez la musique de vingt secondes.")]
+    [InlineData("de-DE", "de-DE.wav", "Treffen heute um 17 Uhr")]
     public async Task SpeechToText_WithFastSupportedLanguage_ShouldRecognizeSpeechWithFastTranscription(string? language, string fileName, string expectedText)
     {
         // Arrange