/**
 * ARIA XTTS Bridge — connects an XTTS v2 server to the RVS.
 *
 * Message types handled on the RVS WebSocket:
 *   - xtts_request     → render audio through the XTTS HTTP API and send it back
 *   - voice_upload     → store a voice sample for cloning
 *   - xtts_list_voices → list the stored voices
 */

const WebSocket = require("ws");
const http = require("http");
const https = require("https");
const fs = require("fs");
const path = require("path");

const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000";
const RVS_HOST = process.env.RVS_HOST || "";
const RVS_PORT = process.env.RVS_PORT || "443";
const RVS_TLS = process.env.RVS_TLS || "true";
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
const RVS_TOKEN = process.env.RVS_TOKEN || "";
const VOICES_DIR = "/voices";

const KEEPALIVE_INTERVAL_MS = 25000;
const XTTS_TIMEOUT_MS = 60000;
const WAV_HEADER_SIZE = 44; // canonical RIFF/WAVE header: RIFF + 16-byte fmt + data
const MAX_CHUNK_CHARS = 250; // upper bound of text sent to XTTS per call
const MAX_PCM_SIZE = 800 * 1024; // upper bound of PCM bytes per RVS message

/**
 * Writes a timestamped line to stdout.
 * @param {string} msg - Text to log.
 */
function log(msg) {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

/**
 * Picks the Node client module matching the protocol of a URL.
 * @param {URL} url - Parsed target URL.
 * @returns {typeof http | typeof https} `https` for `https:` URLs, otherwise `http`.
 */
function httpModuleFor(url) {
  return url.protocol === "https:" ? https : http;
}

/**
 * Maps a voice name to the file-system-safe form used for files in VOICES_DIR.
 * Upload and lookup must both go through this so they agree on the file name.
 * @param {*} name - Voice name as received from the client.
 * @returns {string} Name with every character outside [a-zA-Z0-9_-] replaced by "_".
 */
function sanitizeVoiceName(name) {
  return String(name).replace(/[^a-zA-Z0-9_-]/g, "_");
}

// ── WAV helpers ─────────────────────────────────────

/**
 * Builds a 44-byte RIFF/WAVE header (format tag 1 = PCM) for a PCM payload.
 * @param {number} pcmLength - Size of the PCM payload in bytes.
 * @param {{channels: number, sampleRate: number, bitsPerSample: number}} format
 * @returns {Buffer} The header; the caller appends the PCM data.
 */
function buildWavHeader(pcmLength, { channels, sampleRate, bitsPerSample }) {
  const bytesPerSample = bitsPerSample / 8;
  const header = Buffer.alloc(WAV_HEADER_SIZE);
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + pcmLength, 4);
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // Subchunk1Size
  header.writeUInt16LE(1, 20); // PCM format
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(sampleRate * channels * bytesPerSample, 28); // byte rate
  header.writeUInt16LE(channels * bytesPerSample, 32); // block align
  header.writeUInt16LE(bitsPerSample, 34);
  header.write("data", 36);
  header.writeUInt32LE(pcmLength, 40);
  return header;
}

/**
 * Walks the RIFF chunks of a WAV buffer to locate the `fmt ` and `data` chunks.
 * @param {Buffer} buffer - Candidate WAV file contents.
 * @returns {{formatTag: number, channels: number, sampleRate: number,
 *            bitsPerSample: number, pcm: Buffer} | null}
 *          Parsed format plus a view on the sample data, or null when the buffer
 *          is not RIFF/WAVE or has no `data` chunk preceded by a `fmt ` chunk.
 */
function parseWav(buffer) {
  if (
    buffer.length < 12 ||
    buffer.toString("latin1", 0, 4) !== "RIFF" ||
    buffer.toString("latin1", 8, 12) !== "WAVE"
  ) {
    return null;
  }

  let format = null;
  let offset = 12;
  while (offset + 8 <= buffer.length) {
    const chunkId = buffer.toString("latin1", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const bodyStart = offset + 8;

    if (chunkId === "fmt " && chunkSize >= 16 && bodyStart + 16 <= buffer.length) {
      format = {
        formatTag: buffer.readUInt16LE(bodyStart),
        channels: buffer.readUInt16LE(bodyStart + 2),
        sampleRate: buffer.readUInt32LE(bodyStart + 4),
        bitsPerSample: buffer.readUInt16LE(bodyStart + 14),
      };
    } else if (chunkId === "data") {
      if (!format) return null;
      // A declared size of 0 or one running past the buffer is treated as
      // "everything up to the end of the buffer".
      const declaredEnd = bodyStart + chunkSize;
      const bodyEnd =
        chunkSize === 0 || declaredEnd > buffer.length ? buffer.length : declaredEnd;
      return { ...format, pcm: buffer.subarray(bodyStart, bodyEnd) };
    }

    // RIFF chunks are padded to an even number of bytes.
    offset = bodyStart + chunkSize + (chunkSize % 2);
  }
  return null;
}

/**
 * Reads a WAV buffer, preferring a real chunk walk and falling back to the
 * fixed 44-byte header layout when the chunk walk finds nothing usable.
 * @param {Buffer} buffer - WAV file contents.
 * @returns {ReturnType<typeof parseWav>} Parsed WAV, or null when the buffer is
 *          too short to hold a header plus data.
 */
function readWav(buffer) {
  const parsed = parseWav(buffer);
  if (parsed) return parsed;
  if (buffer.length <= WAV_HEADER_SIZE) return null;
  return {
    formatTag: buffer.readUInt16LE(20),
    channels: buffer.readUInt16LE(22),
    sampleRate: buffer.readUInt32LE(24),
    bitsPerSample: buffer.readUInt16LE(34),
    pcm: buffer.subarray(WAV_HEADER_SIZE),
  };
}

// ── RVS connection ──────────────────────────────────

let rvsWs = null;
let retryDelay = 2; // seconds; doubled after every close, capped at 30

/**
 * Opens the WebSocket to the RVS and wires up keepalive, message dispatch,
 * reconnect with backoff and the optional TLS → plaintext fallback.
 * Exits the process when RVS_HOST or RVS_TOKEN is not set.
 * @param {boolean} [forcePlain] - Use `ws://` even when RVS_TLS is "true".
 */
function connectRVS(forcePlain) {
  if (!RVS_HOST || !RVS_TOKEN) {
    log("RVS nicht konfiguriert — beende");
    process.exit(1);
  }

  const useTls = RVS_TLS === "true" && !forcePlain;
  const proto = useTls ? "wss" : "ws";
  const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;
  // The token is deliberately left out of the log line.
  log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);

  const ws = new WebSocket(url);
  let keepaliveTimer = null;
  let opened = false;

  const stopKeepalive = () => {
    if (keepaliveTimer !== null) {
      clearInterval(keepaliveTimer);
      keepaliveTimer = null;
    }
  };

  ws.on("open", () => {
    log("RVS verbunden — warte auf TTS-Requests");
    opened = true;
    rvsWs = ws;
    retryDelay = 2;

    // Keepalive: protocol-level ping plus an application-level heartbeat.
    keepaliveTimer = setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        ws.ping();
        ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
      }
    }, KEEPALIVE_INTERVAL_MS);
  });

  ws.on("message", async (raw) => {
    try {
      const msg = JSON.parse(raw.toString());

      if (msg.type === "xtts_request") {
        await handleTTSRequest(msg.payload);
      } else if (msg.type === "voice_upload") {
        await handleVoiceUpload(msg.payload);
      } else if (msg.type === "xtts_list_voices") {
        await handleListVoices();
      }
    } catch (err) {
      log(`Fehler: ${err.message}`);
    }
  });

  ws.on("close", () => {
    log("RVS Verbindung geschlossen");
    // Without this the interval of every past connection would keep running.
    stopKeepalive();
    if (rvsWs === ws) rvsWs = null;
    setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
    retryDelay = Math.min(retryDelay * 2, 30);
  });

  ws.on("error", (err) => {
    log(`RVS Fehler: ${err.message}`);
    // Only fall back when the TLS connection never came up. An error on an
    // established TLS session is not a TLS failure and must not downgrade the
    // link (and the token) to plaintext; the "close" handler reconnects instead.
    if (useTls && !opened && RVS_TLS_FALLBACK === "true") {
      log("TLS fehlgeschlagen — Fallback auf ws://");
      stopKeepalive();
      ws.removeAllListeners();
      // The abandoned socket may still emit "error" while closing; an emitter
      // without an "error" listener would throw.
      ws.on("error", () => {});
      try {
        ws.close();
      } catch (_) {
        // Closing a socket that never connected may throw; nothing to clean up.
      }
      connectRVS(true);
    }
  });
}

// ── TTS request handler ─────────────────────────────

/**
 * Splits text into sentences and groups them into chunks of at most roughly
 * MAX_CHUNK_CHARS characters. More context per chunk gives a more consistent
 * voice and volume, while the cap keeps single renders short.
 * A single sentence longer than the cap is kept as its own chunk.
 * @param {string} cleanText - Text with markdown already removed.
 * @returns {{sentences: string[], chunks: string[]}}
 */
function splitIntoChunks(cleanText) {
  const sentences = cleanText
    .split(/(?<=[.!?])\s+/)
    // Trailing periods are removed; they are re-inserted as separators below.
    .map((s) => s.trim().replace(/[.]+$/, "").trim())
    // Filter after stripping so that "..." does not survive as an empty sentence.
    .filter((s) => s.length > 0);

  const chunks = [];
  let current = "";
  for (const sentence of sentences) {
    if (current && current.length + sentence.length + 2 > MAX_CHUNK_CHARS) {
      chunks.push(current);
      current = sentence;
    } else if (current) {
      // Avoid "!. " / "?. " when the previous sentence kept its own punctuation.
      const separator = /[!?]$/.test(current) ? " " : ". ";
      current = `${current}${separator}${sentence}`;
    } else {
      current = sentence;
    }
  }
  if (current) chunks.push(current);

  return { sentences, chunks };
}

/**
 * Renders the requested text chunk by chunk through the XTTS API, joins the PCM
 * data and sends it to the RVS as one or more `xtts_response` messages
 * (base64-encoded WAV). Chunks that fail to render are skipped; when nothing
 * could be rendered an `xtts_response` carrying `error` is sent instead.
 * @param {{text?: string, voice?: string, requestId?: string, language?: string}} payload
 * @returns {Promise<void>}
 */
async function handleTTSRequest(payload) {
  const { text, voice, requestId, language } = payload || {};
  if (typeof text !== "string" || !text) return;

  // Strip markdown bold markers.
  const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim();

  const { sentences, chunks } = splitIntoChunks(cleanText);
  if (chunks.length === 0) return;

  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);

  try {
    // Same name mapping as handleVoiceUpload, so an uploaded voice is found
    // again and the name cannot point outside VOICES_DIR.
    const voiceSample = voice
      ? path.join(VOICES_DIR, `${sanitizeVoiceName(voice)}.wav`)
      : null;
    const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);

    // Render all chunks sequentially and collect their PCM data.
    const pcmBuffers = [];
    let format = null; // taken from the first successfully rendered chunk

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      try {
        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
        const wav = audioBuffer ? readWav(audioBuffer) : null;
        if (wav && wav.pcm.length > 0) {
          if (format === null) {
            format = {
              channels: wav.channels,
              sampleRate: wav.sampleRate,
              bitsPerSample: wav.bitsPerSample,
            };
          }
          pcmBuffers.push(wav.pcm);
          log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`);
        } else {
          log(`TTS [${i + 1}/${chunks.length}]: keine gueltigen Audio-Daten — ueberspringe`);
        }
      } catch (chunkErr) {
        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
      }
    }

    if (pcmBuffers.length === 0) {
      log("TTS: Keine Audio-Daten erzeugt");
      sendToRVS({
        type: "xtts_response",
        payload: { requestId, error: "Keine Audio-Daten" },
        timestamp: Date.now(),
      });
      return;
    }

    const allPcm = Buffer.concat(pcmBuffers);
    const byteRate = format.sampleRate * format.channels * (format.bitsPerSample / 8);

    // Split on whole sample frames so no part starts in the middle of a sample.
    const blockAlign = Math.max(1, Math.floor(format.channels * (format.bitsPerSample / 8)));
    const partSize = Math.max(blockAlign, MAX_PCM_SIZE - (MAX_PCM_SIZE % blockAlign));

    // Larger than the per-message limit → several parts, otherwise a single one.
    const pcmParts = [];
    for (let offset = 0; offset < allPcm.length; offset += partSize) {
      pcmParts.push(allPcm.subarray(offset, offset + partSize));
    }

    for (let p = 0; p < pcmParts.length; p++) {
      const partPcm = pcmParts[p];
      // Every part gets its own header so it is a playable WAV by itself.
      const partWav = Buffer.concat([buildWavHeader(partPcm.length, format), partPcm]);

      sendToRVS({
        type: "xtts_response",
        payload: {
          // Multi-part responses get a "_<index>" suffix on the request id.
          requestId: `${requestId || ""}${pcmParts.length > 1 ? `_${p}` : ""}`,
          base64: partWav.toString("base64"),
          mimeType: "audio/wav",
          voice: voice || "default",
          engine: "xtts",
        },
        timestamp: Date.now(),
      });
    }

    const totalSecs = (byteRate > 0 ? allPcm.length / byteRate : 0).toFixed(1);
    log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`);
  } catch (err) {
    log(`TTS Fehler: ${err.message}`);
    sendToRVS({
      type: "xtts_response",
      payload: { requestId, error: err.message },
      timestamp: Date.now(),
    });
  }
}

/**
 * POSTs one text chunk to `${XTTS_API_URL}/tts_to_audio/` and resolves with the
 * raw response body.
 * @param {string} text - Text to synthesize.
 * @param {string} language - Language code passed through to the API.
 * @param {string|null} speakerWav - Path of the voice sample, or null/empty for none.
 * @returns {Promise<Buffer>} Response body on HTTP 200.
 *          Rejects on any other status, on a network error, or after 60 s.
 */
function callXTTSAPI(text, language, speakerWav) {
  return new Promise((resolve, reject) => {
    const body = JSON.stringify({
      text,
      language,
      speaker_wav: speakerWav || "",
    });

    const url = new URL(`${XTTS_API_URL}/tts_to_audio/`);
    const options = {
      hostname: url.hostname,
      port: url.port,
      path: `${url.pathname}${url.search}`,
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(body),
      },
      timeout: XTTS_TIMEOUT_MS,
    };

    // Honour https:// in XTTS_API_URL instead of always speaking plain http.
    const req = httpModuleFor(url).request(options, (res) => {
      const received = [];
      res.on("data", (part) => received.push(part));
      res.on("error", reject);
      res.on("end", () => {
        if (res.statusCode === 200) {
          resolve(Buffer.concat(received));
        } else {
          reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(received).toString().slice(0, 200)}`));
        }
      });
    });

    req.on("error", reject);
    req.on("timeout", () => {
      // "timeout" only signals inactivity; the request has to be torn down here.
      const timeoutError = new Error("XTTS API Timeout (60s)");
      req.destroy(timeoutError);
      reject(timeoutError);
    });
    req.write(body);
    req.end();
  });
}

// ── Voice upload handler ────────────────────────────

/**
 * Joins decoded voice samples into the contents of a single file.
 * When there are several samples and each is a PCM WAV with the same format,
 * their sample data is merged under one fresh header. In every other case the
 * buffers are concatenated byte for byte.
 * @param {Buffer[]} buffers - Decoded samples, in order.
 * @returns {Buffer} File contents to store.
 */
function combineVoiceSamples(buffers) {
  if (buffers.length > 1) {
    const wavs = buffers.map(parseWav);
    const first = wavs[0];
    const mergeable = wavs.every(
      (wav) =>
        wav !== null &&
        wav.formatTag === 1 &&
        wav.channels === first.channels &&
        wav.sampleRate === first.sampleRate &&
        wav.bitsPerSample === first.bitsPerSample
    );
    if (mergeable) {
      // Byte-concatenating whole WAV files would leave headers inside the audio
      // and a wrong length in the first header.
      const pcm = Buffer.concat(wavs.map((wav) => wav.pcm));
      return Buffer.concat([buildWavHeader(pcm.length, first), pcm]);
    }
  }
  return Buffer.concat(buffers);
}

/**
 * Stores an uploaded voice as `<VOICES_DIR>/<sanitized name>.wav` and confirms
 * with an `xtts_voice_saved` message. Invalid payloads and write errors are
 * logged only; no message is sent in those cases.
 * @param {{name?: string, samples?: Array<{base64: string}>}} payload
 * @returns {Promise<void>}
 */
async function handleVoiceUpload(payload) {
  const { name, samples } = payload || {};
  if (!name || !Array.isArray(samples) || samples.length === 0) {
    log("Voice Upload: Ungueltige Daten");
    return;
  }

  log(`Voice Upload: "${name}" (${samples.length} Samples)`);

  try {
    const buffers = samples.map((s) => Buffer.from(s.base64, "base64"));
    const combined = combineVoiceSamples(buffers);
    if (combined.length === 0) {
      // Nothing decodable was sent; do not leave a 0-byte voice file behind.
      log("Voice Upload: Ungueltige Daten");
      return;
    }

    fs.mkdirSync(VOICES_DIR, { recursive: true });
    const filePath = path.join(VOICES_DIR, `${sanitizeVoiceName(name)}.wav`);
    fs.writeFileSync(filePath, combined);

    log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);

    sendToRVS({
      type: "xtts_voice_saved",
      payload: { name, size: combined.length, path: filePath },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Voice Upload Fehler: ${err.message}`);
  }
}

// ── Voice list handler ──────────────────────────────

/**
 * Sends an `xtts_voices_list` message describing every `.wav` entry in
 * VOICES_DIR (name without extension, file name, size in bytes). A missing
 * directory yields an empty list; errors are logged only.
 * @returns {Promise<void>}
 */
async function handleListVoices() {
  try {
    const files = fs.existsSync(VOICES_DIR)
      ? fs.readdirSync(VOICES_DIR).filter((f) => f.endsWith(".wav"))
      : [];

    const voices = files.map((f) => ({
      name: path.basename(f, ".wav"),
      file: f,
      size: fs.statSync(path.join(VOICES_DIR, f)).size,
    }));

    log(`Stimmen: ${voices.length} verfuegbar`);

    sendToRVS({
      type: "xtts_voices_list",
      payload: { voices },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Stimmen-Liste Fehler: ${err.message}`);
  }
}

// ── Send to RVS ─────────────────────────────────────

/**
 * Sends a message to the RVS as JSON. When no connection is open the message
 * is dropped and the drop is logged.
 * @param {{type: string}} msg - Message object; serialized with JSON.stringify.
 */
function sendToRVS(msg) {
  if (rvsWs && rvsWs.readyState === WebSocket.OPEN) {
    rvsWs.send(JSON.stringify(msg));
  } else {
    log(`RVS nicht verbunden — Nachricht verworfen (${msg && msg.type})`);
  }
}

// ── Start ───────────────────────────────────────────

log("ARIA XTTS Bridge startet...");
log(`XTTS API: ${XTTS_API_URL}`);
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);

/**
 * Polls `${XTTS_API_URL}/docs` until the API answers with any HTTP status, then
 * invokes the callback. Retries every 10 s; once the attempts are used up the
 * callback is invoked anyway.
 * @param {() => void} callback - Invoked exactly once.
 * @param {number} attempts - Remaining attempts.
 */
function waitForXTTS(callback, attempts) {
  if (attempts <= 0) {
    log("XTTS API nicht erreichbar — starte trotzdem");
    callback();
    return;
  }

  // Guards against a late socket error after a response was already received.
  let settled = false;
  const url = new URL(`${XTTS_API_URL}/docs`);

  httpModuleFor(url)
    .get(url, (res) => {
      // Drain the body so the socket is released.
      res.resume();
      if (settled) return;
      settled = true;
      log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
      callback();
    })
    .on("error", () => {
      if (settled) return;
      settled = true;
      log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
      // 10 s between attempts because loading the model takes a while.
      setTimeout(() => waitForXTTS(callback, attempts - 1), 10000);
    });
}

waitForXTTS(() => connectRVS(), 30); // 30 attempts × 10 s = wait at most 5 min