/**
 * ARIA XTTS Bridge — connects the XTTS v2 server to the RVS.
 *
 * Receives xtts_request over RVS → renders audio via the XTTS API → sends it back
 * Receives voice_upload → stores a voice sample for cloning
 * Receives xtts_list_voices → lists the available voices
 */

const WebSocket = require("ws");
const http = require("http");
const https = require("https");
const fs = require("fs");
const path = require("path");

const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000";
const RVS_HOST = process.env.RVS_HOST || "";
const RVS_PORT = process.env.RVS_PORT || "443";
const RVS_TLS = process.env.RVS_TLS || "true";
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
const RVS_TOKEN = process.env.RVS_TOKEN || "";
const VOICES_DIR = "/voices";

// Interval between keepalive pings/heartbeats on the RVS socket.
const KEEPALIVE_INTERVAL_MS = 25000;
// Request timeout for a single XTTS render call.
const XTTS_TIMEOUT_MS = 60000;
// Upper bound for one TTS chunk; short chunks render quickly and the
// receiving side is expected to preload the following ones.
const MAX_CHUNK_CHARS = 150;

/**
 * Writes a timestamped line to stdout.
 * @param {string} msg - Message to log.
 */
function log(msg) {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/**
 * Picks the Node client module matching the URL scheme.
 * @param {URL} url - Parsed target URL.
 * @returns {typeof http | typeof https} `https` for "https:" URLs, otherwise `http`.
 */
function httpClientFor(url) {
  return url.protocol === "https:" ? https : http;
}

/**
 * Maps a voice name to the file-system-safe form used for files in VOICES_DIR.
 * Every character outside [a-zA-Z0-9_-] becomes "_", so the result can never
 * contain path separators or dots.
 * @param {*} name - Voice name as received from a client.
 * @returns {string} Sanitized name (without extension).
 */
function sanitizeVoiceName(name) {
  return String(name).replace(/[^a-zA-Z0-9_-]/g, "_");
}

// ── RVS connection ──────────────────────────────────

let rvsWs = null;
let retryDelay = 2;

/**
 * Opens the WebSocket connection to the RVS and wires up message dispatch,
 * keepalive and reconnect handling. Exits the process when RVS_HOST or
 * RVS_TOKEN is not set.
 * @param {boolean} [forcePlain] - When truthy, use ws:// even if RVS_TLS is "true".
 */
function connectRVS(forcePlain) {
  if (!RVS_HOST || !RVS_TOKEN) {
    log("RVS nicht konfiguriert — beende");
    process.exit(1);
  }

  const useTls = RVS_TLS === "true" && !forcePlain;
  const proto = useTls ? "wss" : "ws";
  const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;
  log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);

  const ws = new WebSocket(url);
  let keepaliveTimer = null;
  let wasOpened = false;

  // The keepalive timer belongs to this socket only; it must be stopped when
  // the socket goes away, otherwise every reconnect would leave a timer behind.
  const stopKeepalive = () => {
    if (keepaliveTimer !== null) {
      clearInterval(keepaliveTimer);
      keepaliveTimer = null;
    }
  };

  ws.on("open", () => {
    log("RVS verbunden — warte auf TTS-Requests");
    rvsWs = ws;
    retryDelay = 2;
    wasOpened = true;

    // Keepalive
    keepaliveTimer = setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        ws.ping();
        ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
      }
    }, KEEPALIVE_INTERVAL_MS);
  });

  ws.on("message", async (raw) => {
    try {
      const msg = JSON.parse(raw.toString());

      if (msg.type === "xtts_request") {
        await handleTTSRequest(msg.payload);
      } else if (msg.type === "voice_upload") {
        await handleVoiceUpload(msg.payload);
      } else if (msg.type === "xtts_list_voices") {
        await handleListVoices();
      }
    } catch (err) {
      log(`Fehler: ${err.message}`);
    }
  });

  ws.on("close", () => {
    log("RVS Verbindung geschlossen");
    stopKeepalive();
    if (rvsWs === ws) {
      rvsWs = null;
    }
    // Reconnect with exponential backoff: 2s, 4s, … capped at 30s.
    setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
    retryDelay = Math.min(retryDelay * 2, 30);
  });

  ws.on("error", (err) => {
    log(`RVS Fehler: ${err.message}`);

    // Fall back to plain ws:// only when the TLS connection never came up.
    // An error on an already established session is handled by the regular
    // "close" → reconnect path and must not downgrade to plaintext.
    if (!wasOpened && useTls && RVS_TLS_FALLBACK === "true") {
      log("TLS fehlgeschlagen — Fallback auf ws://");
      stopKeepalive();
      ws.removeAllListeners();
      // An EventEmitter throws when "error" is emitted without a listener;
      // keep a no-op listener so a late error on the discarded socket
      // cannot crash the process.
      ws.on("error", () => {});
      try {
        ws.close();
      } catch (_) {
        // Best effort: the socket is being discarded anyway.
      }
      connectRVS(true);
    }
  });
}

// ── TTS request handler ─────────────────────────────

/**
 * Strips Markdown and special characters so the text reads naturally when spoken.
 * @param {*} text - Raw text (coerced to string).
 * @returns {string} Cleaned, single-line text.
 */
function cleanTextForSpeech(text) {
  return String(text)
    // Fenced code blocks first — once inline-code stripping has run, the
    // triple backticks are no longer intact and the block would survive.
    .replace(/```[\s\S]*?```/g, "")
    .replace(/`([^`]+)`/g, "$1") // `code` → code
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
    // Block markers are only markers at the start of a line; anchoring keeps
    // hyphens, ">" and "#" inside running text untouched.
    .replace(/^[ \t]*#{1,6}[ \t]*/gm, "") // ### headings
    .replace(/^[ \t]*(?:>[ \t]*)+/gm, "") // > quotes (also nested)
    .replace(/^[ \t]*[-*][ \t]+/gm, "") // - list items / * list items
    .replace(/\*\*([^*]+)\*\*/g, "$1") // **bold** → bold
    .replace(/\*([^*]+)\*/g, "$1") // *italic* → italic
    .replace(/\n{2,}/g, ". ") // paragraph break → full stop
    .replace(/\n/g, ", ") // single newline → comma
    .replace(/\s{2,}/g, " ") // collapse whitespace runs
    .replace(/["\u201C\u201D\u201E]/g, "") // straight and typographic double quotes
    .replace(/\(\)/g, "") // empty parentheses
    .trim();
}

/**
 * Splits cleaned text into sentences and groups them into chunks.
 * Sentences are appended (joined with ", ") to the current chunk until adding
 * the next one would exceed `maxChars`; a single sentence longer than
 * `maxChars` becomes its own chunk.
 * @param {string} cleanText - Output of cleanTextForSpeech.
 * @param {number} maxChars - Soft upper bound for the chunk length.
 * @returns {{sentences: string[], chunks: string[]}}
 */
function splitIntoChunks(cleanText, maxChars) {
  const sentences = cleanText
    .split(/(?<=[.!?])\s+/)
    // Drop trailing full stops BEFORE filtering, so a sentence consisting only
    // of dots does not end up as an empty entry inside a chunk.
    .map((s) => s.trim().replace(/[.]+$/, "").trim())
    .filter((s) => s.length > 0);

  const chunks = [];
  let current = "";
  for (const sentence of sentences) {
    if (current && current.length + sentence.length + 2 > maxChars) {
      chunks.push(current);
      current = sentence;
    } else {
      current = current ? `${current}, ${sentence}` : sentence;
    }
  }
  if (current) {
    chunks.push(current);
  }
  return { sentences, chunks };
}

/**
 * Renders the text of an xtts_request chunk by chunk and streams every
 * rendered chunk back as an xtts_response message. Chunks that fail are
 * skipped; if no chunk at all could be delivered, one error response is sent.
 * @param {{text?: string, voice?: string, requestId?: string, language?: string}} payload
 * @returns {Promise<void>}
 */
async function handleTTSRequest(payload) {
  const { text, voice, requestId, language } = payload || {};
  if (!text) return;

  const cleanText = cleanTextForSpeech(text);
  const { sentences, chunks } = splitIntoChunks(cleanText, MAX_CHUNK_CHARS);
  if (chunks.length === 0) return;

  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);

  try {
    // Resolve the voice with the same sanitizing that voice uploads use, so
    // the name maps to the stored file and cannot point outside VOICES_DIR.
    const voiceSample = voice
      ? path.join(VOICES_DIR, `${sanitizeVoiceName(voice)}.wav`)
      : null;
    const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);

    // Streaming: render a chunk → send it immediately → next chunk.
    let sentCount = 0;
    let lastError = null;
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      try {
        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
        if (!audioBuffer || audioBuffer.length <= 100) {
          // Responses of 100 bytes or less are not treated as usable audio.
          throw new Error(`leere Audio-Antwort (${audioBuffer ? audioBuffer.length : 0} Bytes)`);
        }

        log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
        sendToRVS({
          type: "xtts_response",
          payload: {
            requestId: `${requestId || ""}_${i}`,
            base64: audioBuffer.toString("base64"),
            mimeType: "audio/wav",
            voice: voice || "default",
            engine: "xtts",
            part: i + 1,
            totalParts: chunks.length,
          },
          timestamp: Date.now(),
        });
        sentCount++;
      } catch (chunkErr) {
        lastError = chunkErr;
        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
      }
    }
    log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);

    // Without any delivered chunk the requester would otherwise never hear
    // back; report the failure explicitly.
    if (sentCount === 0) {
      sendToRVS({
        type: "xtts_response",
        payload: { requestId, error: lastError ? lastError.message : "Kein Audio erzeugt" },
        timestamp: Date.now(),
      });
    }
  } catch (err) {
    log(`TTS Fehler: ${err.message}`);
    sendToRVS({
      type: "xtts_response",
      payload: { requestId, error: err.message },
      timestamp: Date.now(),
    });
  }
}

/**
 * POSTs one text chunk to `${XTTS_API_URL}/tts_to_audio/` and collects the
 * response body.
 * @param {string} text - Text to render.
 * @param {string} language - Language code sent as `language`.
 * @param {string|null} speakerWav - Path sent as `speaker_wav`; "" when null.
 * @returns {Promise<Buffer>} Response body on HTTP 200.
 * Rejects on a non-200 status, a request error, or after the 60s timeout.
 */
function callXTTSAPI(text, language, speakerWav) {
  return new Promise((resolve, reject) => {
    const body = JSON.stringify({
      text,
      language,
      speaker_wav: speakerWav || "",
    });

    const url = new URL(`${XTTS_API_URL}/tts_to_audio/`);
    const options = {
      hostname: url.hostname,
      port: url.port,
      path: url.pathname,
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(body),
      },
      timeout: XTTS_TIMEOUT_MS,
    };

    // Use the client matching the configured scheme (http:// or https://).
    const req = httpClientFor(url).request(options, (res) => {
      const parts = [];
      res.on("data", (part) => parts.push(part));
      res.on("end", () => {
        if (res.statusCode === 200) {
          resolve(Buffer.concat(parts));
        } else {
          reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(parts).toString().slice(0, 200)}`));
        }
      });
    });

    req.on("error", reject);
    req.on("timeout", () => {
      req.destroy();
      reject(new Error("XTTS API Timeout (60s)"));
    });
    req.write(body);
    req.end();
  });
}

// ── Voice upload handler ────────────────────────────

/**
 * Stores an uploaded voice as `<sanitized name>.wav` in VOICES_DIR and
 * confirms with an xtts_voice_saved message.
 * @param {{name?: string, samples?: Array<{base64: string}>}} payload
 * @returns {Promise<void>}
 */
async function handleVoiceUpload(payload) {
  const { name, samples } = payload || {};
  if (!name || !samples || !Array.isArray(samples) || samples.length === 0) {
    log("Voice Upload: Ungueltige Daten");
    return;
  }

  log(`Voice Upload: "${name}" (${samples.length} Samples)`);

  try {
    // Join all samples into one buffer.
    // NOTE(review): this is a raw byte concatenation. If each sample is a
    // complete WAV file (with its own header), the result is not a
    // well-formed WAV — confirm what the uploader sends.
    const buffers = samples.map((s) => Buffer.from(s.base64, "base64"));
    const combined = Buffer.concat(buffers);

    // Store as .wav
    fs.mkdirSync(VOICES_DIR, { recursive: true });
    const filePath = path.join(VOICES_DIR, `${sanitizeVoiceName(name)}.wav`);
    fs.writeFileSync(filePath, combined);

    log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);

    sendToRVS({
      type: "xtts_voice_saved",
      payload: { name, size: combined.length, path: filePath },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Voice Upload Fehler: ${err.message}`);
  }
}

// ── Voice list handler ──────────────────────────────

/**
 * Sends an xtts_voices_list message describing every .wav file in VOICES_DIR
 * (name without extension, file name, size in bytes).
 * @returns {Promise<void>}
 */
async function handleListVoices() {
  try {
    const files = fs.existsSync(VOICES_DIR)
      ? fs.readdirSync(VOICES_DIR).filter((f) => f.endsWith(".wav"))
      : [];

    const voices = files.map((f) => ({
      name: path.basename(f, ".wav"),
      file: f,
      size: fs.statSync(path.join(VOICES_DIR, f)).size,
    }));

    log(`Stimmen: ${voices.length} verfuegbar`);

    sendToRVS({
      type: "xtts_voices_list",
      payload: { voices },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Stimmen-Liste Fehler: ${err.message}`);
  }
}

// ── Send to RVS ─────────────────────────────────────

/**
 * Sends a message as JSON over the current RVS socket.
 * The message is dropped when no socket is open.
 * @param {object} msg - Message to serialize.
 */
function sendToRVS(msg) {
  if (rvsWs && rvsWs.readyState === WebSocket.OPEN) {
    rvsWs.send(JSON.stringify(msg));
  }
}

// ── Start ───────────────────────────────────────────

log("ARIA XTTS Bridge startet...");
log(`XTTS API: ${XTTS_API_URL}`);
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);

/**
 * Polls `${XTTS_API_URL}/docs` until it answers (any HTTP status), then calls
 * `callback`. After `attempts` failed tries the callback is called anyway.
 * @param {Function} callback - Invoked once, without arguments.
 * @param {number} attempts - Remaining connection attempts.
 */
function waitForXTTS(callback, attempts) {
  if (attempts <= 0) {
    log("XTTS API nicht erreichbar — starte trotzdem");
    callback();
    return;
  }

  const docsUrl = new URL(`${XTTS_API_URL}/docs`);
  httpClientFor(docsUrl)
    .get(docsUrl, (res) => {
      // Only the status matters; drain the body so the socket is released.
      res.resume();
      log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
      callback();
    })
    .on("error", () => {
      log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
      // 10s between attempts (loading the model takes a while).
      setTimeout(() => waitForXTTS(callback, attempts - 1), 10000);
    });
}

// 30 attempts × 10s = wait at most 5 minutes.
waitForXTTS(() => connectRVS(), 30);