diff --git a/xtts/Dockerfile b/xtts/Dockerfile deleted file mode 100644 index 6c79bc5..0000000 --- a/xtts/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM node:22-alpine -WORKDIR /app -COPY bridge.js package.json ./ -RUN npm install --production -CMD ["node", "bridge.js"] diff --git a/xtts/bridge.js b/xtts/bridge.js deleted file mode 100644 index 673309c..0000000 --- a/xtts/bridge.js +++ /dev/null @@ -1,596 +0,0 @@ -/** - * ARIA XTTS Bridge — Verbindet XTTS v2 Server mit dem RVS - * - * Empfaengt tts_request ueber RVS → rendert Audio via XTTS API → sendet zurueck - * Empfaengt voice_upload → speichert Voice-Sample fuer Cloning - * Empfaengt xtts_list_voices → listet verfuegbare Stimmen - */ - -const WebSocket = require("ws"); -const http = require("http"); -const https = require("https"); -const fs = require("fs"); -const path = require("path"); - -const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000"; -const RVS_HOST = process.env.RVS_HOST || ""; -const RVS_PORT = process.env.RVS_PORT || "443"; -const RVS_TLS = process.env.RVS_TLS || "true"; -const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true"; -const RVS_TOKEN = process.env.RVS_TOKEN || ""; -const VOICES_DIR = "/voices"; - -function log(msg) { - console.log(`[${new Date().toISOString()}] ${msg}`); -} - -// ── RVS Verbindung ────────────────────────────────── - -let rvsWs = null; -let retryDelay = 2; - -function connectRVS(forcePlain) { - if (!RVS_HOST || !RVS_TOKEN) { - log("RVS nicht konfiguriert — beende"); - process.exit(1); - } - - const useTls = RVS_TLS === "true" && !forcePlain; - const proto = useTls ? 
"wss" : "ws"; - const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`; - - log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`); - - const ws = new WebSocket(url); - - ws.on("open", () => { - log("RVS verbunden — warte auf TTS-Requests"); - rvsWs = ws; - retryDelay = 2; - - // Keepalive - setInterval(() => { - if (ws.readyState === WebSocket.OPEN) { - ws.ping(); - ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() })); - } - }, 25000); - }); - - ws.on("message", async (raw) => { - try { - const msg = JSON.parse(raw.toString()); - - if (msg.type === "xtts_request") { - await handleTTSRequest(msg.payload); - } else if (msg.type === "voice_upload") { - await handleVoiceUpload(msg.payload); - } else if (msg.type === "xtts_list_voices") { - await handleListVoices(); - } else if (msg.type === "xtts_delete_voice") { - await handleDeleteVoice(msg.payload); - } else if (msg.type === "voice_preload") { - await handleVoicePreload(msg.payload); - } else if (msg.type === "config") { - // Diagnostic hat globale Voice gewechselt → Preload damit der naechste - // Render ohne Ladewartezeit startet + alle Clients "voice_ready" sehen - const v = msg.payload && msg.payload.xttsVoice; - if (v && v !== lastDiagnosticVoice) { - lastDiagnosticVoice = v; - await handleVoicePreload({ voice: v, source: "diagnostic" }); - } else if (!v) { - lastDiagnosticVoice = ""; - } - } - } catch (err) { - log(`Fehler: ${err.message}`); - } - }); - - ws.on("close", () => { - log("RVS Verbindung geschlossen"); - rvsWs = null; - setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000)); - retryDelay = Math.min(retryDelay * 2, 30); - }); - - ws.on("error", (err) => { - log(`RVS Fehler: ${err.message}`); - if (useTls && RVS_TLS_FALLBACK === "true") { - log("TLS fehlgeschlagen — Fallback auf ws://"); - ws.removeAllListeners(); - try { ws.close(); } catch (_) {} - connectRVS(true); - } - }); -} - -// ── TTS Request Handler ───────────────────────────── - -/** - * 
Linearer Fade-In auf einen base64-PCM-Chunk (s16le). - * Mascht XTTS-Warmup-Glitches am Anfang eines Renders. - */ -function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) { - const buf = Buffer.from(base64Pcm, "base64"); - const totalSamples = buf.length / 2; // s16le - const fadeSamples = Math.min( - Math.floor((fadeMs / 1000) * sampleRate) * channels, - totalSamples - ); - for (let i = 0; i < fadeSamples; i++) { - const sample = buf.readInt16LE(i * 2); - const gain = i / fadeSamples; - buf.writeInt16LE(Math.round(sample * gain), i * 2); - } - return buf.toString("base64"); -} - -// ── TTS-Queue ────────────────────────────────────── -// XTTS verarbeitet Requests sequenziell, damit Streams sich nicht ueberlappen. -// Ohne Queue wuerden parallele Requests parallel streamen → App bekommt -// interleaved PCM-Chunks aus zwei Rendern → klingt wie Chaos. -let ttsQueue = Promise.resolve(); - -// Merkt sich die letzte in Diagnostic gewaehlte Voice, damit wir nicht bei jedem -// config-Broadcast (auch ohne Aenderung) einen Preload triggern. -let lastDiagnosticVoice = ""; - -function handleTTSRequest(payload) { - ttsQueue = ttsQueue.then(() => _runTTSRequest(payload)).catch(err => { - log(`TTS-Queue Fehler: ${err.message}`); - }); - return ttsQueue; -} - -async function _runTTSRequest(payload) { - const { text, voice, requestId, language, messageId } = payload; - if (!text) return; - - // Markdown-Cleanup (Bridge macht jetzt auch Cleanup, aber safety net) - let cleanText = text - .replace(/\*\*([^*]+)\*\*/g, "$1") - .replace(/\*([^*]+)\*/g, "$1") - .replace(/`([^`]+)`/g, "$1") - .replace(/```[\s\S]*?```/g, "") - .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") - .replace(/#{1,6}\s*/g, "") - .replace(/>\s*/g, "") - .replace(/[-*]\s+/g, "") - .replace(/\n{2,}/g, ". ") - .replace(/\n/g, ", ") - .replace(/\s{2,}/g, " ") - .replace(/["""„]/g, "") - .replace(/\(\)/g, "") - .trim(); - - log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." 
(${cleanText.length} chars, voice: ${voice || "default"})`); - - try { - // Im local-Mode erwartet daswer123 XTTS speaker_wav als Basename (ohne .wav, - // ohne Pfad) — der Server prefixt EXAMPLE_FOLDER selbst. Wir checken hier - // nur das physische File ab um Warnungen zu loggen; runter ans API geht - // nur der Name. - const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; - const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath); - const speakerName = hasCustomVoice ? voice : ""; - if (voice && !hasCustomVoice) { - log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`); - } else if (hasCustomVoice) { - log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`); - } - - let chunkIndex = 0; - let pcmMeta = null; - let firstChunkSeen = false; - - const onChunk = (pcmBase64, meta) => { - if (!pcmMeta) pcmMeta = meta; - let outBase64 = pcmBase64; - // Fade-In auf den ersten Chunk — maskiert XTTS-Warmup-Glitches - // (autoregressiver Generator hat am Anfang wenig Kontext → Artefakte). - if (!firstChunkSeen && pcmBase64) { - firstChunkSeen = true; - outBase64 = applyFadeIn(pcmBase64, meta.sampleRate, meta.channels, 120); - } - sendToRVS({ - type: "audio_pcm", - payload: { - requestId: requestId || "", - messageId: messageId || "", - base64: outBase64, - format: "pcm_s16le", - sampleRate: meta.sampleRate, - channels: meta.channels, - voice: voice || "default", - chunk: chunkIndex++, - final: false, - }, - timestamp: Date.now(), - }); - }; - - // /tts_stream fuer echtes Streaming (funktioniert im XTTS local-Mode). - // Wenn Server im apiManual/api-Mode laeuft: 400 → Fallback auf /tts_to_audio/. 
- try { - await streamXTTSAsPCM( - cleanText, - language || "de", - speakerName, - onChunk, - ); - } catch (streamErr) { - log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`); - await streamXTTSBatch( - cleanText, - language || "de", - speakerName, - onChunk, - ); - } - - // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann - if (pcmMeta) { - sendToRVS({ - type: "audio_pcm", - payload: { - requestId: requestId || "", - messageId: messageId || "", - base64: "", - format: "pcm_s16le", - sampleRate: pcmMeta.sampleRate, - channels: pcmMeta.channels, - voice: voice || "default", - chunk: chunkIndex++, - final: true, - }, - timestamp: Date.now(), - }); - } - - log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`); - } catch (err) { - log(`TTS Fehler: ${err.message}`); - sendToRVS({ - type: "xtts_response", - payload: { requestId, error: err.message }, - timestamp: Date.now(), - }); - } -} - -/** - * Ruft /tts_stream auf — echter Streaming-Endpoint bei daswer123. - * Schickt was der Server verlangt (allow: GET), aber mit JSON-Body - * als POST scheitert mit 405. Manche Versionen wollen GET + Query, - * andere POST + JSON. Testen was funktioniert. - */ -function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) { - return new Promise((resolve, reject) => { - // Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) — - // auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422. - // stream_chunk_size=250: grosse Chunks = wenige Chunk-Grenzen = wenig - // Render-Artefakte. daswer123 erzeugt an Chunk-Boundaries haeufig Glitches - // in den Worten die ueber die Grenze gehen. Hoehere Latenz ist OK. 
- const qs = new URLSearchParams(); - qs.set("text", text); - qs.set("language", language || "de"); - qs.set("speaker_wav", speakerWav || ""); - qs.set("stream_chunk_size", "250"); - - const url = new URL(XTTS_API_URL); - const fullPath = `/tts_stream?${qs.toString()}`; - const options = { - hostname: url.hostname, - port: url.port || 80, - path: fullPath, - method: "GET", - timeout: 60000, - }; - - log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`); - - const req = http.request(options, (res) => { - if (res.statusCode !== 200) { - let body = ""; - res.on("data", (d) => { body += d.toString(); }); - res.on("end", () => { - log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`); - reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)); - }); - return; - } - log(`TTS stream verbunden, empfange PCM...`); - - let headerParsed = false; - let sampleRate = 24000; - let channels = 1; - let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk - const HEADER_BYTES = 44; - let headerBuf = Buffer.alloc(0); - const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono - - res.on("data", (chunk) => { - let data = chunk; - - // WAV-Header konsumieren (44 Bytes) - if (!headerParsed) { - headerBuf = Buffer.concat([headerBuf, data]); - if (headerBuf.length < HEADER_BYTES) return; - // Header lesen - const header = headerBuf.slice(0, HEADER_BYTES); - try { - channels = header.readUInt16LE(22); - sampleRate = header.readUInt32LE(24); - } catch (_) {} - headerParsed = true; - data = headerBuf.slice(HEADER_BYTES); - } - - // leftover aus vorherigem Chunk + neuer data - let combined = Buffer.concat([leftover, data]); - - // In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits) - while (combined.length >= PCM_CHUNK_BYTES) { - const slice = combined.slice(0, PCM_CHUNK_BYTES); - combined = combined.slice(PCM_CHUNK_BYTES); - onPcmChunk(slice.toString("base64"), { 
sampleRate, channels }); - } - leftover = combined; - }); - - res.on("end", () => { - // Rest-Daten senden - if (leftover.length > 0) { - onPcmChunk(leftover.toString("base64"), { sampleRate, channels }); - } - resolve(); - }); - - res.on("error", reject); - }); - - req.on("error", reject); - req.on("timeout", () => { req.destroy(); reject(new Error("XTTS API Timeout (60s)")); }); - req.end(); - }); -} - -/** - * Fallback: /tts_to_audio/ (POST JSON) — rendert komplett, dann response. - * Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht. - * Shared chunking-Logik mit streamXTTSAsPCM — parst WAV-Header, stueckelt PCM. - */ -function streamXTTSBatch(text, language, speakerWav, onPcmChunk) { - return new Promise((resolve, reject) => { - const body = JSON.stringify({ - text, - language: language || "de", - speaker_wav: speakerWav || "", - }); - const url = new URL(XTTS_API_URL); - const options = { - hostname: url.hostname, - port: url.port || 80, - path: "/tts_to_audio/", - method: "POST", - headers: { - "Content-Type": "application/json", - "Content-Length": Buffer.byteLength(body), - }, - timeout: 60000, - }; - - const req = http.request(options, (res) => { - if (res.statusCode !== 200) { - let rb = ""; - res.on("data", (d) => { rb += d.toString(); }); - res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`))); - return; - } - let headerParsed = false; - let sampleRate = 24000; - let channels = 1; - let leftover = Buffer.alloc(0); - let headerBuf = Buffer.alloc(0); - const HEADER_BYTES = 44; - const PCM_CHUNK_BYTES = 8192; - - res.on("data", (chunk) => { - let data = chunk; - if (!headerParsed) { - headerBuf = Buffer.concat([headerBuf, data]); - if (headerBuf.length < HEADER_BYTES) return; - const header = headerBuf.slice(0, HEADER_BYTES); - try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {} - headerParsed = true; - data = headerBuf.slice(HEADER_BYTES); - } - 
let combined = Buffer.concat([leftover, data]); - while (combined.length >= PCM_CHUNK_BYTES) { - const slice = combined.slice(0, PCM_CHUNK_BYTES); - combined = combined.slice(PCM_CHUNK_BYTES); - onPcmChunk(slice.toString("base64"), { sampleRate, channels }); - } - leftover = combined; - }); - res.on("end", () => { - if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels }); - resolve(); - }); - res.on("error", reject); - }); - req.on("error", reject); - req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); }); - req.write(body); - req.end(); - }); -} - -// ── Voice Upload Handler ──────────────────────────── - -async function handleVoiceUpload(payload) { - const { name, samples } = payload; - if (!name || !samples || !Array.isArray(samples) || samples.length === 0) { - log("Voice Upload: Ungueltige Daten"); - return; - } - - log(`Voice Upload: "${name}" (${samples.length} Samples)`); - - try { - // Alle Samples zusammenfuegen - const buffers = samples.map(s => Buffer.from(s.base64, "base64")); - const combined = Buffer.concat(buffers); - - // Als WAV speichern - fs.mkdirSync(VOICES_DIR, { recursive: true }); - const filePath = path.join(VOICES_DIR, `${name.replace(/[^a-zA-Z0-9_-]/g, "_")}.wav`); - fs.writeFileSync(filePath, combined); - - log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`); - - sendToRVS({ - type: "xtts_voice_saved", - payload: { name, size: combined.length, path: filePath }, - timestamp: Date.now(), - }); - } catch (err) { - log(`Voice Upload Fehler: ${err.message}`); - } -} - -// ── Voice Delete Handler ──────────────────────────── - -async function handleDeleteVoice(payload) { - const { name } = payload || {}; - if (!name || typeof name !== "string") { - log("Voice Delete: ungueltiger Name"); - return; - } - const safe = name.replace(/[^a-zA-Z0-9_-]/g, "_"); - const filePath = path.join(VOICES_DIR, `${safe}.wav`); - try { - if 
(fs.existsSync(filePath)) { - fs.unlinkSync(filePath); - log(`Voice geloescht: ${filePath}`); - } else { - log(`Voice Delete: Datei existiert nicht (${filePath})`); - } - // Aktualisierte Liste an alle Clients senden - await handleListVoices(); - } catch (err) { - log(`Voice Delete Fehler: ${err.message}`); - } -} - -// ── Voice List Handler ────────────────────────────── - -/** - * Preload einer Stimme — rendert stumm ein kurzes Dummy-Audio, damit XTTS - * die Speaker-Latents laedt und der naechste echte Request ohne Wartezeit - * loslegen kann. Broadcastet "voice_ready" wenn fertig (oder mit error). - */ -async function handleVoicePreload(payload) { - const voice = (payload && payload.voice) || ""; - const source = (payload && payload.source) || "unknown"; - const requestId = (payload && payload.requestId) || ""; - log(`Voice-Preload angefordert: "${voice}" (source=${source})`); - - try { - let speakerName = ""; - if (voice) { - const voiceFilePath = path.join(VOICES_DIR, `${voice}.wav`); - if (!fs.existsSync(voiceFilePath)) { - sendToRVS({ - type: "voice_ready", - payload: { voice, requestId, error: "voice-file-not-found" }, - timestamp: Date.now(), - }); - log(`Preload abgebrochen: ${voiceFilePath} existiert nicht`); - return; - } - speakerName = voice; - } - - // Dummy-Request via Queue — damit sich Preload nicht mit echtem TTS ueberholt. 
- const t0 = Date.now(); - await new Promise((resolve, reject) => { - ttsQueue = ttsQueue.then(async () => { - try { - await streamXTTSAsPCM("ja.", "de", speakerName, () => {}); - resolve(); - } catch (err) { - reject(err); - } - }).catch(reject); - }); - const ms = Date.now() - t0; - log(`Voice "${voice || "default"}" geladen in ${ms}ms`); - - sendToRVS({ - type: "voice_ready", - payload: { voice, requestId, loadMs: ms }, - timestamp: Date.now(), - }); - } catch (err) { - log(`Voice-Preload Fehler: ${err.message}`); - sendToRVS({ - type: "voice_ready", - payload: { voice, requestId, error: err.message.slice(0, 200) }, - timestamp: Date.now(), - }); - } -} - -async function handleListVoices() { - try { - const files = fs.existsSync(VOICES_DIR) - ? fs.readdirSync(VOICES_DIR).filter(f => f.endsWith(".wav")) - : []; - - const voices = files.map(f => ({ - name: path.basename(f, ".wav"), - file: f, - size: fs.statSync(path.join(VOICES_DIR, f)).size, - })); - - log(`Stimmen: ${voices.length} verfuegbar`); - - sendToRVS({ - type: "xtts_voices_list", - payload: { voices }, - timestamp: Date.now(), - }); - } catch (err) { - log(`Stimmen-Liste Fehler: ${err.message}`); - } -} - -// ── RVS senden ────────────────────────────────────── - -function sendToRVS(msg) { - if (rvsWs && rvsWs.readyState === WebSocket.OPEN) { - rvsWs.send(JSON.stringify(msg)); - } -} - -// ── Start ─────────────────────────────────────────── - -log("ARIA XTTS Bridge startet..."); -log(`XTTS API: ${XTTS_API_URL}`); -log(`RVS: ${RVS_HOST}:${RVS_PORT}`); - -// Warten bis XTTS API erreichbar ist -function waitForXTTS(callback, attempts) { - if (attempts <= 0) { log("XTTS API nicht erreichbar — starte trotzdem"); callback(); return; } - http.get(`${XTTS_API_URL}/docs`, (res) => { - log(`XTTS API erreichbar (HTTP ${res.statusCode})`); - callback(); - }).on("error", () => { - log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`); - setTimeout(() => waitForXTTS(callback, attempts - 1), 
10000); // 10s statt 5s (Model laden dauert) - }); -} - -waitForXTTS(() => connectRVS(), 30); // Max 5min warten diff --git a/xtts/docker-compose.yml b/xtts/docker-compose.yml index b5dac87..c6bf89d 100644 --- a/xtts/docker-compose.yml +++ b/xtts/docker-compose.yml @@ -1,7 +1,7 @@ # ════════════════════════════════════════════════ -# ARIA XTTS v2 — GPU TTS Server +# ARIA Gamebox Stack — GPU F5-TTS + Whisper STT # Laeuft auf dem Gaming-PC (RTX 3060) -# Verbindet sich zum RVS fuer TTS-Requests +# Verbindet sich zum RVS fuer TTS/STT-Requests # ════════════════════════════════════════════════ # # Voraussetzungen: @@ -10,15 +10,18 @@ # - .env mit RVS-Verbindungsdaten # # Start: docker compose up -d -# Test: curl http://localhost:8000/docs # ════════════════════════════════════════════════ services: - # ─── XTTS v2 API Server (GPU) ───────────────── - xtts: - image: daswer123/xtts-api-server:latest - container_name: aria-xtts + # ─── F5-TTS Bridge (GPU) ────────────────────── + # Ersetzt den frueheren XTTS-Stack. Empfaengt xtts_request via RVS, + # rendert via F5-TTS mit Voice-Cloning, streamt PCM an die App. + # Voice-Upload: speichert WAV und laesst whisper-bridge den Referenz- + # text transkribieren — der User muss nichts eintippen. + f5tts-bridge: + build: ./f5tts + container_name: aria-f5tts-bridge deploy: resources: reservations: @@ -26,45 +29,29 @@ services: - driver: nvidia count: 1 capabilities: [gpu] - ports: - - "8000:8020" volumes: - - xtts-models:/app/xtts_models # Model-Cache (~2GB) - - ./voices:/voices # Custom Voice Samples + - ./voices:/voices # WAV + TXT Referenz + - f5tts-models:/root/.cache/huggingface # Model-Cache persistieren environment: - - COQUI_TOS_AGREED=1 - # Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM, - # Render startet sofort, /tts_stream funktioniert. 
- # Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"} - - MODEL_SOURCE=local - # Speaker-Folder auf unsere gemounteten voices zeigen lassen - - EXAMPLE_FOLDER=/voices - restart: unless-stopped - - # ─── XTTS Bridge (verbindet zu RVS) ─────────── - xtts-bridge: - build: . - container_name: aria-xtts-bridge - depends_on: - - xtts - volumes: - - ./voices:/voices # Shared mit XTTS-Server - environment: - - XTTS_API_URL=http://xtts:8020 - RVS_HOST=${RVS_HOST} - RVS_PORT=${RVS_PORT:-443} - RVS_TLS=${RVS_TLS:-true} - RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true} - RVS_TOKEN=${RVS_TOKEN} + - F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base} + - F5TTS_DEVICE=${F5TTS_DEVICE:-cuda} + - VOICES_DIR=/voices restart: unless-stopped # ─── Whisper STT (GPU) ──────────────────────── # Faster-Whisper auf der Gamebox statt auf der VM (CPU) — # deutlich schneller. Verbindet sich selbst per WebSocket an # den RVS und nimmt dort stt_request Nachrichten der aria-bridge - # entgegen, antwortet mit stt_response. Laedt das Modell beim - # Start vor; auf Config-Broadcasts (Diagnostic → whisperModel) - # wird zur Laufzeit hot-swapped. + # entgegen, antwortet mit stt_response. Zusaetzlich nutzt die + # f5tts-bridge Whisper intern fuer die Referenz-Transkription bei + # Voice-Uploads. Laedt das Modell beim Start vor; auf Config- + # Broadcasts (Diagnostic → whisperModel) wird zur Laufzeit hot- + # swapped. 
whisper-bridge: build: ./whisper container_name: aria-whisper-bridge @@ -86,9 +73,9 @@ services: - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16} - WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de} volumes: - - whisper-models:/root/.cache/huggingface # Model-Cache persistieren + - whisper-models:/root/.cache/huggingface restart: unless-stopped volumes: - xtts-models: + f5tts-models: whisper-models: diff --git a/xtts/f5tts/Dockerfile b/xtts/f5tts/Dockerfile new file mode 100644 index 0000000..81c08ba --- /dev/null +++ b/xtts/f5tts/Dockerfile @@ -0,0 +1,21 @@ +FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip ffmpeg git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# PyTorch CUDA-Wheels zuerst (f5-tts zieht sonst CPU-only Torch rein) +RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \ + --index-url https://download.pytorch.org/whl/cu121 + +COPY requirements.txt . +RUN pip3 install --no-cache-dir -r requirements.txt + +COPY bridge.py . + +CMD ["python3", "bridge.py"] diff --git a/xtts/f5tts/bridge.py b/xtts/f5tts/bridge.py new file mode 100644 index 0000000..737b4ec --- /dev/null +++ b/xtts/f5tts/bridge.py @@ -0,0 +1,580 @@ +#!/usr/bin/env python3 +""" +ARIA F5-TTS Bridge — laeuft auf der Gamebox (RTX 3060). + +Empfaengt xtts_request via RVS → F5-TTS Voice Cloning auf GPU → streamt +16-bit PCM Chunks als audio_pcm Nachrichten zurueck an die App. + +Voice-Layout im VOICES_DIR: + {name}.wav — Referenz-Audio (6-10s, 24kHz mono empfohlen) + {name}.txt — Referenz-Text (UTF-8, was im WAV gesprochen wird) + +Beim voice_upload senden wir intern einen stt_request an die whisper-bridge +und legen die Transkription als .txt ab — der User muss keinen Text eingeben. 
+ +Env: + RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN + F5TTS_MODEL Default: F5TTS_v1_Base + F5TTS_DEVICE Default: cuda + VOICES_DIR Default: /voices +""" +import asyncio +import base64 +import json +import logging +import os +import re +import subprocess +import sys +import tempfile +import time +import uuid +from pathlib import Path +from typing import Optional + +import numpy as np +import soundfile as sf +import websockets + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", +) +logger = logging.getLogger("f5tts-bridge") +# HuggingFace + Torch download-Logs etwas daempfen +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) + +RVS_HOST = os.getenv("RVS_HOST", "").strip() +RVS_PORT = int(os.getenv("RVS_PORT", "443")) +RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true" +RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true" +RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip() + +F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base") +F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda") +VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices")) + +PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16 +TARGET_SR = 24000 # F5-TTS native + +# ── Lazy F5-TTS Loader ────────────────────────────────────── + +_F5TTS_cls = None + + +def _get_f5tts_cls(): + """Lazy import damit Startup-Logs nicht durch Torch-Warnungen zumuellen.""" + global _F5TTS_cls + if _F5TTS_cls is None: + from f5_tts.api import F5TTS as _cls + _F5TTS_cls = _cls + return _F5TTS_cls + + +class F5Runner: + """Haelt das F5-TTS-Modell. 
Synthese laeuft im Executor (blocking).""" + + def __init__(self) -> None: + self.model = None + self._lock = asyncio.Lock() + + def _load_blocking(self) -> None: + cls = _get_f5tts_cls() + logger.info("Lade F5-TTS '%s' (device=%s)...", F5TTS_MODEL, F5TTS_DEVICE) + t0 = time.time() + self.model = cls(model=F5TTS_MODEL, device=F5TTS_DEVICE) + logger.info("F5-TTS geladen in %.1fs", time.time() - t0) + + async def ensure_loaded(self) -> None: + async with self._lock: + if self.model is not None: + return + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, self._load_blocking) + + def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]: + wav, sr, _ = self.model.infer( + ref_file=ref_wav, + ref_text=ref_text, + gen_text=gen_text, + remove_silence=True, + seed=-1, + ) + # F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard + if not isinstance(wav, np.ndarray): + wav = np.asarray(wav, dtype=np.float32) + if wav.ndim > 1: + wav = wav.squeeze() + return wav.astype(np.float32), int(sr) + + async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]: + await self.ensure_loaded() + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text) + + +# ── Helpers ───────────────────────────────────────────────── + +_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+") + + +def split_sentences(text: str, max_len: int = 350) -> list[str]: + """Teilt langen Text an Satzgrenzen. 
Kurze Texte bleiben als-is.""" + text = text.strip() + if not text: + return [] + if len(text) <= max_len: + return [text] + parts = [p.strip() for p in _SENTENCE_SPLIT.split(text) if p.strip()] + # Zu kurze Fragmente mergen damit F5-TTS nicht an jedem Komma neu startet + merged: list[str] = [] + buf = "" + for p in parts: + if len(buf) + len(p) + 1 <= max_len: + buf = f"{buf} {p}".strip() + else: + if buf: + merged.append(buf) + buf = p + if buf: + merged.append(buf) + return merged or [text] + + +def float_to_pcm16(wav: np.ndarray) -> bytes: + """Float32 (-1..+1) → int16 little-endian bytes.""" + wav = np.clip(wav, -1.0, 1.0) + pcm = (wav * 32767.0).astype(np.int16) + return pcm.tobytes() + + +def sanitize_voice_name(name: str) -> str: + return re.sub(r"[^a-zA-Z0-9_-]", "_", name) + + +def voice_paths(name: str) -> tuple[Path, Path]: + safe = sanitize_voice_name(name) + return VOICES_DIR / f"{safe}.wav", VOICES_DIR / f"{safe}.txt" + + +def ensure_24k_mono_wav(src_wav: Path) -> Path: + """F5-TTS moechte 24kHz mono als Referenz — ffmpeg konvertiert inplace. + + Wenn das File schon passt, wird nichts geaendert. Sonst wird es + reingeschrieben (Original wird ueberschrieben). 
+ """ + try: + info = sf.info(str(src_wav)) + if info.samplerate == TARGET_SR and info.channels == 1: + return src_wav + except Exception: + pass + tmp_out = src_wav.with_suffix(".conv.wav") + cmd = ["ffmpeg", "-y", "-i", str(src_wav), + "-ar", str(TARGET_SR), "-ac", "1", "-f", "wav", str(tmp_out)] + r = subprocess.run(cmd, capture_output=True, timeout=30) + if r.returncode != 0: + logger.warning("ffmpeg-Konvertierung von %s fehlgeschlagen: %s", + src_wav, r.stderr.decode(errors="replace")[:200]) + try: + tmp_out.unlink() + except OSError: + pass + return src_wav + os.replace(tmp_out, src_wav) + return src_wav + + +async def _send(ws, mtype: str, payload: dict) -> None: + try: + await ws.send(json.dumps({ + "type": mtype, + "payload": payload, + "timestamp": int(time.time() * 1000), + })) + except Exception as e: + logger.warning("Send fehlgeschlagen (%s): %s", mtype, e) + + +# ── Interne Transkription via whisper-bridge ──────────────── + +_pending_stt: dict[str, asyncio.Future] = {} +_STT_TIMEOUT_S = 60.0 + + +async def request_transcription(ws, wav_path: Path, language: str = "de") -> Optional[str]: + """Sendet einen stt_request an die whisper-bridge (ueber RVS) und wartet auf stt_response.""" + try: + with open(wav_path, "rb") as f: + audio_b64 = base64.b64encode(f.read()).decode("ascii") + except Exception as e: + logger.error("Lesen %s fehlgeschlagen: %s", wav_path, e) + return None + + request_id = str(uuid.uuid4()) + loop = asyncio.get_event_loop() + fut: asyncio.Future = loop.create_future() + _pending_stt[request_id] = fut + + try: + await _send(ws, "stt_request", { + "requestId": request_id, + "audio": audio_b64, + "mimeType": "audio/wav", + "model": "small", # klein reicht fuer Voice-Referenz + "language": language, + }) + return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S) + except asyncio.TimeoutError: + logger.warning("Transkription Timeout fuer %s", wav_path.name) + return None + except Exception as e: + logger.warning("Transkription Fehler: 
# NOTE(review): this chunk opens with the tail of a definition that begins before
# the visible range. It is outside the reviewed scope and is carried over verbatim
# (as collapsed patch text), not edited:
#     %s", e) + return None + finally: + _pending_stt.pop(request_id, None)


# ── TTS request handler ─────────────────────────────────────

# Queue so that parallel requests do not overlap (GPU throughput).
_tts_queue: asyncio.Queue[tuple] = asyncio.Queue()

# Strong references to fire-and-forget handler tasks. The event loop itself only
# keeps weak references to tasks, so a task whose result is discarded may be
# garbage-collected before it finishes.
_background_tasks: set = set()


def _spawn(coro) -> asyncio.Task:
    """Schedule ``coro`` as a background task and keep it referenced until done."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task


async def _tts_worker(ws, runner: F5Runner) -> None:
    """Serialise syntheses by draining ``_tts_queue`` one request at a time.

    Each queue item is a ``(text, voice, request_id, message_id, language)``
    tuple. A failing request is logged and does not stop the worker.
    (Original rationale: the GPU can otherwise run out of memory.)
    """
    while True:
        text, voice, request_id, message_id, language = await _tts_queue.get()
        try:
            await _do_tts(ws, runner, text, voice, request_id, message_id, language)
        except Exception:
            logger.exception("TTS-Worker Fehler")
        finally:
            _tts_queue.task_done()


def _default_reference():
    """Pick the fallback reference voice.

    Prefers ``default_ref.wav``/``default_ref.txt`` in ``VOICES_DIR``; otherwise
    the alphabetically first ``*.wav`` that has a sibling ``.txt``.

    Returns:
        ``(wav_path_as_str, reference_text)`` or ``None`` if no usable pair exists.
    """
    default_wav = VOICES_DIR / "default_ref.wav"
    default_txt = VOICES_DIR / "default_ref.txt"
    if default_wav.exists() and default_txt.exists():
        return str(default_wav), default_txt.read_text(encoding="utf-8").strip()
    # Sorted so the chosen fallback is the same on every call (glob order is
    # unspecified) — same ordering handle_list_voices uses.
    for wav in sorted(VOICES_DIR.glob("*.wav")):
        txt = wav.with_suffix(".txt")
        if txt.exists():
            return str(wav), txt.read_text(encoding="utf-8").strip()
    return None


async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
                  request_id: str, message_id: str, language: str) -> None:
    """Synthesise ``text`` sentence by sentence and stream it as ``audio_pcm`` messages.

    Args:
        ws: Connection handed to ``_send`` / ``request_transcription``.
        runner: Object whose ``synthesize(sentence, ref_wav, ref_text)`` coroutine
            returns ``(wav, sample_rate)``.
        text: Text to speak; split with ``split_sentences``.
        voice: Custom voice name, or empty for the default reference.
        request_id: Echoed as ``requestId`` in every outgoing message.
        message_id: Echoed as ``messageId`` in every ``audio_pcm`` message.
        language: Passed to the transcription request when the reference text
            of a custom voice has to be created on the fly.

    On a synthesis error an ``xtts_response`` with an ``error`` field is sent and
    the function returns without emitting the final marker. If no reference
    voice can be found at all, the request is dropped after logging an error.
    """
    t0 = time.time()
    ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
    # Both paths are guarded against None so that a missing txt path cannot
    # raise here (the code below already treats it as optional).
    wav_ok = bool(ref_wav_path and ref_wav_path.exists())
    txt_ok = bool(ref_txt_path and ref_txt_path.exists())
    has_custom = bool(voice and wav_ok and txt_ok)
    if voice and not has_custom:
        # Only the WAV exists but no txt → transcribe on the fly.
        if wav_ok and not txt_ok:
            logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
            text_ref = await request_transcription(ws, ref_wav_path, language)
            if text_ref:
                try:
                    ref_txt_path.write_text(text_ref.strip(), encoding="utf-8")
                    has_custom = True
                    logger.info("Referenz-Text nachgezogen: '%s'", text_ref[:60])
                except Exception as e:
                    logger.warning("Referenz-Text speichern fehlgeschlagen: %s", e)
        if not has_custom:
            logger.warning("Voice '%s' nicht komplett (%s, txt=%s) — nehme Default",
                           voice, ref_wav_path, (ref_txt_path and ref_txt_path.exists()))

    if has_custom:
        ref_wav_str = str(ref_wav_path)
        ref_text = ref_txt_path.read_text(encoding="utf-8").strip()
    else:
        # Fallback: no custom voice. F5-TTS ALWAYS needs a reference, so use
        # default_ref.wav/txt if present, otherwise any available voice pair.
        fallback = _default_reference()
        if fallback is None:
            logger.error("Keine Referenz-Stimme im VOICES_DIR — TTS abgebrochen")
            return
        ref_wav_str, ref_text = fallback

    sentences = split_sentences(text)
    logger.info("F5-TTS: %d Satz(e), voice=%s (%s)", len(sentences), voice or "default", ref_wav_str)

    chunk_index = 0
    pcm_sr = TARGET_SR
    for i, sent in enumerate(sentences):
        try:
            wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
            pcm_sr = sr
            pcm_bytes = float_to_pcm16(wav)
            # The very first PCM buffer of the first sentence gets a fade-in
            # (masks possible warm-up glitches). Everything else stays untouched.
            if i == 0 and chunk_index == 0:
                pcm_bytes = _fade_in_pcm16(pcm_bytes, sr, 120)

            # Slice into transport-sized chunks.
            for off in range(0, len(pcm_bytes), PCM_CHUNK_BYTES):
                slice_ = pcm_bytes[off:off + PCM_CHUNK_BYTES]
                await _send(ws, "audio_pcm", {
                    "requestId": request_id,
                    "messageId": message_id,
                    "base64": base64.b64encode(slice_).decode("ascii"),
                    "format": "pcm_s16le",
                    "sampleRate": sr,
                    "channels": 1,
                    "voice": voice or "default",
                    "chunk": chunk_index,
                    "final": False,
                })
                chunk_index += 1
        except Exception as e:
            logger.exception("F5-TTS Synthese-Fehler (Satz %d)", i)
            await _send(ws, "xtts_response", {
                "requestId": request_id,
                "error": str(e)[:200],
            })
            return

    # Final marker: empty payload, carries the sample rate of the last sentence.
    await _send(ws, "audio_pcm", {
        "requestId": request_id,
        "messageId": message_id,
        "base64": "",
        "format": "pcm_s16le",
        "sampleRate": pcm_sr,
        "channels": 1,
        "voice": voice or "default",
        "chunk": chunk_index,
        "final": True,
    })

    logger.info("TTS komplett: %d Chunks, %.2fs render (voice=%s, text=%d chars)",
                chunk_index, time.time() - t0, voice or "default", len(text))


def _fade_in_pcm16(pcm: bytes, sr: int, fade_ms: int) -> bytes:
    """Apply a linear fade-in to the first ``fade_ms`` milliseconds of 16-bit PCM.

    Args:
        pcm: Raw buffer interpreted as ``int16`` samples.
        sr: Sample rate in Hz, used to convert ``fade_ms`` into a sample count.
        fade_ms: Fade length in milliseconds; clamped to the buffer length.

    Returns:
        A new buffer with the ramp applied, or ``pcm`` itself when there is
        nothing to fade (empty buffer or non-positive fade length).
    """
    arr = np.frombuffer(pcm, dtype=np.int16).copy()
    fade_samples = min(int((fade_ms / 1000.0) * sr), len(arr))
    if fade_samples <= 0:
        return pcm
    ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
    arr[:fade_samples] = (arr[:fade_samples].astype(np.float32) * ramp).astype(np.int16)
    return arr.tobytes()


# ── Voice management handlers ───────────────────────────────

async def handle_voice_upload(ws, payload: dict) -> None:
    """Store an uploaded voice sample, normalise it and create its reference text.

    Reads ``name`` and ``samples`` (a list of dicts with a ``base64`` field)
    from ``payload``. Replies with ``xtts_voice_saved`` (containing either
    ``size``/``refText`` or ``error``) and, on success, refreshes the voice list.
    An invalid payload is only logged; no reply is sent in that case.
    """
    name = (payload.get("name") or "").strip()
    samples = payload.get("samples") or []
    if not name or not samples:
        logger.warning("voice_upload: ungueltig (name=%r, samples=%d)", name, len(samples))
        return
    logger.info("Voice-Upload: '%s' (%d Samples)", name, len(samples))

    try:
        VOICES_DIR.mkdir(parents=True, exist_ok=True)
        safe = sanitize_voice_name(name)
        wav_path = VOICES_DIR / f"{safe}.wav"
        txt_path = VOICES_DIR / f"{safe}.txt"

        # Join the samples.
        # NOTE(review): the decoded buffers are concatenated byte-wise. That only
        # yields a valid WAV if the samples are consecutive chunks of ONE file —
        # confirm against the client.
        buffers = [base64.b64decode(s.get("base64", "")) for s in samples]
        with open(wav_path, "wb") as f:
            for b in buffers:
                f.write(b)
        size_kb = wav_path.stat().st_size / 1024
        logger.info("Voice WAV gespeichert: %s (%.0fKB)", wav_path, size_kb)

        # Normalise to 24 kHz mono (in case the app delivers another format).
        ensure_24k_mono_wav(wav_path)
        # Report the size of the file as it is on disk now, so the value agrees
        # with what handle_list_voices reports for the same file.
        final_size = wav_path.stat().st_size

        # Request the transcription via the whisper bridge.
        logger.info("Transkribiere '%s' via whisper-bridge...", name)
        text = await request_transcription(ws, wav_path, language="de")
        if not text:
            logger.warning("Transkription fehlgeschlagen — speichere Platzhalter-Text")
            text = "Das ist ein Referenz Audio."
        txt_path.write_text(text.strip(), encoding="utf-8")
        logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])

        await _send(ws, "xtts_voice_saved", {
            "name": name, "size": final_size, "refText": text.strip(),
        })
        # Refresh the list.
        await handle_list_voices(ws)
    except Exception as e:
        logger.exception("voice_upload Fehler")
        await _send(ws, "xtts_voice_saved", {"name": name, "error": str(e)[:200]})


async def handle_list_voices(ws) -> None:
    """Send ``xtts_voices_list`` with one entry per ``*.wav`` in ``VOICES_DIR``.

    Each entry has ``name`` (file stem), ``file``, ``size`` in bytes and
    ``hasRefText`` (whether a sibling ``.txt`` exists). Errors are logged only.
    """
    try:
        voices = []
        if VOICES_DIR.exists():
            for wav in sorted(VOICES_DIR.glob("*.wav")):
                txt = wav.with_suffix(".txt")
                voices.append({
                    "name": wav.stem,
                    "file": wav.name,
                    "size": wav.stat().st_size,
                    "hasRefText": txt.exists(),
                })
        logger.info("Stimmen-Liste: %d", len(voices))
        await _send(ws, "xtts_voices_list", {"voices": voices})
    except Exception:
        logger.exception("handle_list_voices Fehler")


async def handle_delete_voice(ws, payload: dict) -> None:
    """Delete the WAV and txt of the voice named in ``payload['name']``.

    Missing files are skipped. Afterwards the voice list is re-sent. An empty
    name is ignored; errors are logged only.
    """
    name = (payload.get("name") or "").strip()
    if not name:
        return
    try:
        wav, txt = voice_paths(name)
        for p in (wav, txt):
            # Guard against None paths, as the other handlers do for voice_paths().
            if p and p.exists():
                p.unlink()
                logger.info("Voice geloescht: %s", p)
        await handle_list_voices(ws)
    except Exception:
        logger.exception("handle_delete_voice Fehler")


# Last voice set via a diagnostic "config" message (prevents an endless preload
# on every config message).
_last_diag_voice = ""


async def handle_voice_preload(ws, payload: dict, runner: F5Runner) -> None:
    """Warm up a voice with a dummy render and report the result as ``voice_ready``.

    Reads ``voice`` and ``requestId`` from ``payload``. If the voice's WAV is
    missing, ``voice_ready`` carries ``error: "voice-file-not-found"``. If only
    the reference text is missing, it is transcribed first. On success
    ``voice_ready`` carries ``loadMs``; on any exception it carries ``error``.
    """
    voice = (payload.get("voice") or "").strip()
    request_id = payload.get("requestId", "")
    logger.info("Voice-Preload angefordert: '%s'", voice or "default")

    try:
        ref_wav, ref_txt = voice_paths(voice) if voice else (None, None)
        if voice and (not ref_wav or not ref_wav.exists()):
            await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": "voice-file-not-found"})
            return

        # Make sure the reference text exists (in case only the WAV is there).
        if voice and ref_txt and not ref_txt.exists():
            text = await request_transcription(ws, ref_wav, language="de")
            if text:
                ref_txt.write_text(text.strip(), encoding="utf-8")
                logger.info("Referenz-Text beim Preload nachgezogen")

        # Dummy render as warm-up.
        # NOTE(review): this calls _do_tts directly instead of going through
        # _tts_queue, so it can run concurrently with the TTS worker — confirm
        # that the runner serialises GPU access itself.
        t0 = time.time()
        await _do_tts(ws, runner, "ja.", voice, f"preload-{request_id}", "", "de")
        ms = int((time.time() - t0) * 1000)
        await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "loadMs": ms})
    except Exception as e:
        logger.exception("Voice-Preload Fehler")
        await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": str(e)[:200]})


# ── Main loop ───────────────────────────────────────────────

async def _handle_message(ws, msg: dict, runner: F5Runner) -> None:
    """Dispatch one decoded RVS message to its handler.

    TTS requests are queued for the serialising worker; voice management and
    preload handlers run as background tasks; ``stt_response`` resolves the
    matching future in ``_pending_stt``; ``config`` triggers a preload when the
    ``xttsVoice`` value changed. Unknown message types are ignored.
    """
    global _last_diag_voice

    mtype = msg.get("type", "")
    payload = msg.get("payload", {}) or {}

    if mtype == "xtts_request":
        await _tts_queue.put((
            payload.get("text", ""),
            payload.get("voice", "") or "",
            payload.get("requestId", ""),
            payload.get("messageId", ""),
            payload.get("language", "de"),
        ))
    elif mtype == "voice_upload":
        _spawn(handle_voice_upload(ws, payload))
    elif mtype == "xtts_list_voices":
        _spawn(handle_list_voices(ws))
    elif mtype == "xtts_delete_voice":
        _spawn(handle_delete_voice(ws, payload))
    elif mtype == "voice_preload":
        _spawn(handle_voice_preload(ws, payload, runner))
    elif mtype == "stt_response":
        # Reply to our own internal transcription request.
        req_id = payload.get("requestId", "")
        fut = _pending_stt.get(req_id)
        if fut and not fut.done():
            if payload.get("error"):
                fut.set_result(None)
            else:
                fut.set_result(payload.get("text") or "")
    elif mtype == "config":
        v = (payload.get("xttsVoice") or "").strip()
        if v and v != _last_diag_voice:
            _last_diag_voice = v
            _spawn(handle_voice_preload(
                ws, {"voice": v, "source": "diagnostic"}, runner,
            ))
        elif not v:
            _last_diag_voice = ""


async def run_loop(runner: F5Runner) -> None:
    """Connect to the RVS and process messages forever, reconnecting on loss.

    Reconnect policy:
      * Every lost or closed connection is followed by a back-off sleep
        (2 s doubling up to 30 s, reset after a successful connect), including
        a clean close by the server.
      * The plain ``ws://`` fallback is only used when a TLS connection attempt
        fails before the connection was established, and only if
        ``RVS_TLS_FALLBACK`` is set. A drop of an already established TLS
        session retries TLS, so the token is not sent in clear text just
        because of a transient error.
      * A message that is not a JSON object, or whose handling raises, is
        skipped/logged without tearing down the connection.
    """
    # Start the model preload in the background so start-up is not blocked.
    _spawn(runner.ensure_loaded())

    use_tls = RVS_TLS
    retry_s = 2
    tls_fallback_tried = False

    while True:
        scheme = "wss" if use_tls else "ws"
        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
        connected = False

        try:
            logger.info("Verbinde zu RVS: %s", masked)
            async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
                connected = True
                logger.info("RVS verbunden")
                retry_s = 2
                tls_fallback_tried = False

                # Start the TTS worker for this connection.
                worker = asyncio.create_task(_tts_worker(ws, runner))

                try:
                    async for raw in ws:
                        try:
                            msg = json.loads(raw)
                        except Exception:
                            continue
                        if not isinstance(msg, dict):
                            # Valid JSON but not an object — nothing to dispatch.
                            continue
                        try:
                            await _handle_message(ws, msg, runner)
                        except Exception:
                            logger.exception("Fehler bei Nachricht (type=%r)", msg.get("type"))
                finally:
                    worker.cancel()
                    try:
                        await worker
                    except asyncio.CancelledError:
                        pass
            # Reached when the server closed the connection without an error.
            logger.info("RVS Verbindung geschlossen")
        except Exception as e:
            logger.warning("Verbindung verloren: %s", e)
            if not connected and use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
                logger.info("TLS fehlgeschlagen — Fallback auf ws://")
                use_tls = False
                tls_fallback_tried = True
                continue

        await asyncio.sleep(min(retry_s, 30))
        retry_s = min(retry_s * 2, 30)


async def main() -> None:
    """Validate the configuration, create the voices directory and run the bridge."""
    if not RVS_HOST:
        logger.error("RVS_HOST nicht gesetzt — Abbruch")
        sys.exit(1)
    VOICES_DIR.mkdir(parents=True, exist_ok=True)
    runner = F5Runner()
    await run_loop(runner)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        sys.exit(0)


# NOTE(review): the original chunk continues with patch hunks for other files.
# They contain no code definitions and are carried over verbatim, not edited:
#     diff --git a/xtts/f5tts/requirements.txt b/xtts/f5tts/requirements.txt new file mode 100644 index 0000000..97708c0 --- /dev/null +++ b/xtts/f5tts/requirements.txt @@ -0,0 +1,5 @@ +f5-tts>=1.0.0 +websockets>=12.0 +numpy>=1.24 +soundfile>=0.12 +requests>=2.31
#     diff --git a/xtts/package.json b/xtts/package.json deleted file mode 100644 index d4b8188..0000000 --- a/xtts/package.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "aria-xtts-bridge", - "version": "1.0.0", - "private": true, - "dependencies": { - "ws": "^8.16.0" - } -}