Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 125 additions & 21 deletions Releases/v3.0/.claude/VoiceServer/server.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
#!/usr/bin/env bun
/**
* Voice Server - Personal AI Voice notification server with multi-provider TTS
*
* Supported TTS providers:
* - ElevenLabs (default) — high-quality AI voices, 10K free chars/month
* - Google Cloud TTS — WaveNet/Neural2/Standard voices, up to 4M free chars/month
*
* Provider selection: settings.json daidentity.ttsProvider ("elevenlabs" | "google-cloud")
* Falls back to ElevenLabs if not specified (backwards compatible).
*
* Architecture: Pure pass-through. All voice config comes from settings.json.
* The server has zero hardcoded voice parameters.
Expand Down Expand Up @@ -34,11 +41,9 @@ if (existsSync(envPath)) {

// Port the HTTP server listens on; falls back to 8888 when PORT is unset or empty.
// The radix is explicit so the value is always parsed as decimal.
const PORT = parseInt(process.env.PORT || "8888", 10);

// Provider API keys read from the environment. Either may be undefined.
const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY;
const GOOGLE_CLOUD_API_KEY = process.env.GOOGLE_CLOUD_API_KEY;

// TTS provider is resolved after settings.json is loaded (see below).
// No key is validated here: warning about a missing ElevenLabs key at this
// point would be misleading when Google Cloud is the selected provider.

// ==========================================================================
// Pronunciation System
Expand Down Expand Up @@ -130,9 +135,31 @@ interface VoiceEntry {
volume: number;
}

/**
 * Identifier of the text-to-speech backend used to synthesize audio.
 * Selected via `daidentity.ttsProvider` in settings.json; any value other
 * than "google-cloud" resolves to "elevenlabs".
 */
type TtsProvider = 'elevenlabs' | 'google-cloud';

/** Voice selection and prosody settings used for Google Cloud TTS requests. */
interface GoogleCloudVoiceConfig {
  /** Language code, e.g. "en-US". */
  languageCode: string;
  /** Full voice name, e.g. "en-US-Neural2-D". */
  voiceName: string;
  /** Voice family the configured voice belongs to. */
  voiceType: 'STANDARD' | 'WAVENET' | 'NEURAL2';
  /** Speaking rate, 0.25 to 4.0; default 1.0. */
  speakingRate: number;
  /** Pitch adjustment, -20.0 to 20.0; default 0.0. */
  pitch: number;
}

/**
 * Google Cloud voice used when settings.json is missing, unreadable, or
 * omits individual Google Cloud voice fields.
 */
const FALLBACK_GOOGLE_VOICE: GoogleCloudVoiceConfig = {
  languageCode: 'en-US',
  voiceName: 'en-US-Neural2-D',
  voiceType: 'NEURAL2',
  speakingRate: 1, // normal speed
  pitch: 0, // no pitch shift
};

// Loaded config from settings.json
interface LoadedVoiceConfig {
defaultVoiceId: string;
ttsProvider: TtsProvider;
googleVoice: GoogleCloudVoiceConfig;
voices: Record<string, VoiceEntry>; // keyed by name ("main", "algorithm")
voicesByVoiceId: Record<string, VoiceEntry>; // keyed by voiceId for lookup
desktopNotifications: boolean; // whether to show macOS notification banners
Expand All @@ -155,7 +182,7 @@ function loadVoiceConfig(): LoadedVoiceConfig {
try {
if (!existsSync(settingsPath)) {
console.warn('⚠️ settings.json not found — using fallback voice defaults');
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
return { defaultVoiceId: '', ttsProvider: 'elevenlabs', googleVoice: FALLBACK_GOOGLE_VOICE, voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
}

const content = readFileSync(settingsPath, 'utf-8');
Expand Down Expand Up @@ -189,16 +216,30 @@ function loadVoiceConfig(): LoadedVoiceConfig {
// Default voice ID from settings
const defaultVoiceId = voices.main?.voiceId || daidentity.mainDAVoiceID || '';

// TTS provider selection
const ttsProvider: TtsProvider = daidentity.ttsProvider === 'google-cloud' ? 'google-cloud' : 'elevenlabs';

// Google Cloud voice config
const gcVoice = daidentity.googleCloudVoice || {};
const googleVoice: GoogleCloudVoiceConfig = {
languageCode: gcVoice.languageCode || FALLBACK_GOOGLE_VOICE.languageCode,
voiceName: gcVoice.voiceName || FALLBACK_GOOGLE_VOICE.voiceName,
voiceType: gcVoice.voiceType || FALLBACK_GOOGLE_VOICE.voiceType,
speakingRate: gcVoice.speakingRate ?? FALLBACK_GOOGLE_VOICE.speakingRate,
pitch: gcVoice.pitch ?? FALLBACK_GOOGLE_VOICE.pitch,
};

const voiceNames = Object.keys(voices);
console.log(`✅ Loaded ${voiceNames.length} voice config(s) from settings.json: ${voiceNames.join(', ')}`);
console.log(`🔊 TTS provider: ${ttsProvider}`);
for (const [name, entry] of Object.entries(voices)) {
console.log(` ${name}: ${entry.voiceName || entry.voiceId} (speed: ${entry.speed}, stability: ${entry.stability})`);
}

return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications };
return { defaultVoiceId, ttsProvider, googleVoice, voices, voicesByVoiceId, desktopNotifications };
} catch (error) {
console.error('⚠️ Failed to load settings.json voice config:', error);
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
return { defaultVoiceId: '', ttsProvider: 'elevenlabs', googleVoice: FALLBACK_GOOGLE_VOICE, voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
}
}

Expand Down Expand Up @@ -330,19 +371,13 @@ function validateInput(input: any): { valid: boolean; error?: string; sanitized?
}

// Generate speech using ElevenLabs API — pure pass-through of voice_settings
async function generateSpeech(
async function generateSpeechElevenLabs(
text: string,
voiceId: string,
voiceSettings: ElevenLabsVoiceSettings
): Promise<ArrayBuffer> {
if (!ELEVENLABS_API_KEY) {
throw new Error('ElevenLabs API key not configured');
}

// Apply pronunciation replacements before sending to TTS
const pronouncedText = applyPronunciations(text);
if (pronouncedText !== text) {
console.log(`📖 Pronunciation: "${text}" → "${pronouncedText}"`);
throw new Error('ElevenLabs API key not configured — add ELEVENLABS_API_KEY to ~/.env');
}

const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;
Expand All @@ -355,7 +390,7 @@ async function generateSpeech(
'xi-api-key': ELEVENLABS_API_KEY,
},
body: JSON.stringify({
text: pronouncedText,
text,
model_id: 'eleven_turbo_v2_5',
voice_settings: voiceSettings,
}),
Expand All @@ -369,6 +404,68 @@ async function generateSpeech(
return await response.arrayBuffer();
}

/**
 * Generate speech using the Google Cloud Text-to-Speech REST API (no SDK dependency).
 *
 * @param text - Text to synthesize; sent unchanged as the plain-text input.
 * @param gcVoice - Voice selection and prosody settings. Only `languageCode`,
 *   `voiceName`, `speakingRate` and `pitch` are sent; `voiceType` is not part
 *   of the request.
 * @returns The MP3 audio bytes decoded from the response's base64 `audioContent`.
 * @throws Error when GOOGLE_CLOUD_API_KEY is not configured, when the API
 *   responds with a non-2xx status, or when the response carries no audio.
 */
async function generateSpeechGoogleCloud(
  text: string,
  gcVoice: GoogleCloudVoiceConfig
): Promise<ArrayBuffer> {
  if (!GOOGLE_CLOUD_API_KEY) {
    throw new Error('Google Cloud API key not configured — add GOOGLE_CLOUD_API_KEY to ~/.env');
  }

  const url = 'https://texttospeech.googleapis.com/v1/text:synthesize';

  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // Send the key as a header instead of a `?key=` query parameter so it
      // does not end up in URLs that get logged or echoed in error traces.
      'X-Goog-Api-Key': GOOGLE_CLOUD_API_KEY,
    },
    body: JSON.stringify({
      input: { text },
      voice: {
        languageCode: gcVoice.languageCode,
        name: gcVoice.voiceName,
      },
      audioConfig: {
        audioEncoding: 'MP3',
        speakingRate: gcVoice.speakingRate,
        pitch: gcVoice.pitch,
      },
    }),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Google Cloud TTS API error: ${response.status} - ${errorText}`);
  }

  const data = (await response.json()) as { audioContent?: unknown };

  // Fail with a clear message rather than letting atob() choke on `undefined`.
  if (typeof data.audioContent !== 'string' || data.audioContent.length === 0) {
    throw new Error('Google Cloud TTS API error: response contained no audio content');
  }

  // Google returns base64-encoded audio — decode to ArrayBuffer.
  // atob() yields one character per byte, so each char code is the byte value.
  const bytes = Uint8Array.from(atob(data.audioContent), (ch) => ch.charCodeAt(0));
  return bytes.buffer;
}

/**
 * Entry point for speech synthesis: rewrites the text with the pronunciation
 * rules, then dispatches to whichever provider `voiceConfig.ttsProvider` names.
 *
 * @param text - Raw text to speak.
 * @param voiceId - ElevenLabs voice ID; only used on the ElevenLabs path.
 * @param voiceSettings - ElevenLabs voice settings; only used on the ElevenLabs path.
 * @returns Audio bytes produced by the selected provider.
 */
async function generateSpeech(
  text: string,
  voiceId: string,
  voiceSettings: ElevenLabsVoiceSettings
): Promise<ArrayBuffer> {
  // Pronunciation replacements happen once here so both providers get the same text.
  const spokenText = applyPronunciations(text);
  const wasRewritten = spokenText !== text;
  if (wasRewritten) {
    console.log(`📖 Pronunciation: "${text}" → "${spokenText}"`);
  }

  switch (voiceConfig.ttsProvider) {
    case 'google-cloud':
      return generateSpeechGoogleCloud(spokenText, voiceConfig.googleVoice);
    default:
      // Anything else is treated as ElevenLabs.
      return generateSpeechElevenLabs(spokenText, voiceId, voiceSettings);
  }
}

// Play audio using afplay (macOS)
async function playAudio(audioBuffer: ArrayBuffer, volume: number = FALLBACK_VOLUME): Promise<void> {
const tempFile = `/tmp/voice-${Date.now()}.mp3`;
Expand Down Expand Up @@ -688,9 +785,10 @@ const server = serve({
JSON.stringify({
status: "healthy",
port: PORT,
voice_system: "ElevenLabs",
voice_system: voiceConfig.ttsProvider,
default_voice_id: DEFAULT_VOICE_ID,
api_key_configured: !!ELEVENLABS_API_KEY,
elevenlabs_configured: !!ELEVENLABS_API_KEY,
google_cloud_configured: !!GOOGLE_CLOUD_API_KEY,
pronunciation_rules: pronunciationRules.length,
configured_voices: Object.keys(voiceConfig.voices),
}),
Expand All @@ -709,8 +807,14 @@ const server = serve({
});

// Startup banner. The voice and API-key lines are provider-specific so the
// output never claims ElevenLabs is in use (or reports only its key status)
// when Google Cloud is the configured provider.
console.log(`🚀 Voice Server running on port ${PORT}`);
console.log(`🔊 TTS provider: ${voiceConfig.ttsProvider}`);
if (voiceConfig.ttsProvider === 'google-cloud') {
  console.log(`🎙️ Using Google Cloud TTS (voice: ${voiceConfig.googleVoice.voiceName}, type: ${voiceConfig.googleVoice.voiceType})`);
  console.log(`🔑 Google Cloud API Key: ${GOOGLE_CLOUD_API_KEY ? '✅ Configured' : '❌ Missing'}`);
} else {
  console.log(`🎙️ Using ElevenLabs TTS (default voice: ${DEFAULT_VOICE_ID})`);
  console.log(`🔑 ElevenLabs API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Missing'}`);
}
console.log(`📡 POST to http://localhost:${PORT}/notify`);
console.log(`🔒 Security: CORS restricted to localhost, rate limiting enabled`);
console.log(`📖 Pronunciations: ${pronunciationRules.length} rules loaded`);
Loading