From c62ceafdc22cc4ba6d5eb729a155b85d505acab8 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Wed, 22 Apr 2026 15:53:10 +0200 Subject: [PATCH] fix: XTTS-Endpoint mit Fallback-Chain + Diagnose-Logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: /tts_stream hat bei User nicht funktioniert → keine Sprachausgabe mehr. Server hatte vorher 405 fuer POST geantwortet, meine Umstellung auf GET scheint aber einen anderen Fehler zu produzieren der nicht geloggt wurde. Fix: - streamXTTSAsPCM() = /tts_stream (GET, Streaming) mit ausfuehrlichem Error-Logging bei non-200 Response - streamXTTSBatch() = /tts_to_audio/ (POST, Batch) als Fallback - handleTTSRequest versucht Stream zuerst, bei Exception Fallback auf Batch — so gibt's IMMER Audio, auch wenn /tts_stream kaputt ist - Log zeigt welcher Pfad benutzt wurde Co-Authored-By: Claude Opus 4.7 (1M context) --- xtts/bridge.js | 168 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 128 insertions(+), 40 deletions(-) diff --git a/xtts/bridge.js b/xtts/bridge.js index 0a6da4e..d2783f5 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -138,31 +138,43 @@ async function _runTTSRequest(payload) { let chunkIndex = 0; let pcmMeta = null; - // EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen. - // XTTS rendert und wir streamen PCM sobald es reinkommt. - await streamXTTSAsPCM( - cleanText, - language || "de", - hasCustomVoice ? voiceSample : null, - (pcmBase64, meta) => { - if (!pcmMeta) pcmMeta = meta; - sendToRVS({ - type: "audio_pcm", - payload: { - requestId: requestId || "", - messageId: messageId || "", - base64: pcmBase64, - format: "pcm_s16le", - sampleRate: meta.sampleRate, - channels: meta.channels, - voice: voice || "default", - chunk: chunkIndex++, - final: false, - }, - timestamp: Date.now(), - }); - }, - ); + const onChunk = (pcmBase64, meta) => { + if (!pcmMeta) pcmMeta = meta; + sendToRVS({ + type: "audio_pcm", + payload: { + requestId: requestId || "", + messageId: messageId || "", + base64: pcmBase64, + format: "pcm_s16le", + sampleRate: meta.sampleRate, + channels: meta.channels, + voice: voice || "default", + chunk: chunkIndex++, + final: false, + }, + timestamp: Date.now(), + }); + }; + + // Erst /tts_stream (GET) versuchen — echter Streaming, schnell. + // Bei Fehler Fallback auf /tts_to_audio/ (POST) damit Audio trotzdem kommt. + try { + await streamXTTSAsPCM( + cleanText, + language || "de", + hasCustomVoice ? voiceSample : null, + onChunk, + ); + } catch (streamErr) { + log(`/tts_stream fehlgeschlagen (${streamErr.message}) — Fallback auf /tts_to_audio/`); + await streamXTTSBatch( + cleanText, + language || "de", + hasCustomVoice ? voiceSample : null, + onChunk, + ); + } // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann if (pcmMeta) { @@ -195,37 +207,42 @@ async function _runTTSRequest(payload) { } /** - * Ruft /tts_stream (GET) auf — echter Streaming-Endpoint bei daswer123. - * Samples fliessen waehrend XTTS rendert (chunked transfer). - * Time-to-first-audio ~300-500ms statt 2-4s beim batch-Endpoint. - * - * Parameter werden als Query-String uebergeben (GET-API). + * Ruft /tts_stream auf — echter Streaming-Endpoint bei daswer123. + * Schickt was der Server verlangt (allow: GET), aber mit JSON-Body + * als POST scheitert mit 405. Manche Versionen wollen GET + Query, + * andere POST + JSON. Testen was funktioniert. */ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) { return new Promise((resolve, reject) => { - const qs = new URLSearchParams({ - text, - language: language || "de", - speaker_wav: speakerWav ? speakerWav : "", - stream_chunk_size: "40", - }); + const qs = new URLSearchParams(); + qs.set("text", text); + qs.set("language", language || "de"); + if (speakerWav) qs.set("speaker_wav", speakerWav); + qs.set("stream_chunk_size", "40"); - const url = new URL(`${XTTS_API_URL}/tts_stream?${qs.toString()}`); + const url = new URL(XTTS_API_URL); + const fullPath = `/tts_stream?${qs.toString()}`; const options = { hostname: url.hostname, - port: url.port, - path: `${url.pathname}?${url.searchParams.toString()}`, + port: url.port || 80, + path: fullPath, method: "GET", timeout: 60000, }; + log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`); + const req = http.request(options, (res) => { if (res.statusCode !== 200) { let body = ""; res.on("data", (d) => { body += d.toString(); }); - res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`))); + res.on("end", () => { + log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`); + reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)); + }); return; } + log(`TTS stream verbunden, empfange PCM...`); let headerParsed = false; let sampleRate = 24000; @@ -281,6 +298,77 @@ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) { }); } +/** + * Fallback: /tts_to_audio/ (POST JSON) — rendert komplett, dann response. + * Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht. + * Shared chunking-Logik mit streamXTTSAsPCM — parst WAV-Header, stueckelt PCM. + */ +function streamXTTSBatch(text, language, speakerWav, onPcmChunk) { + return new Promise((resolve, reject) => { + const body = JSON.stringify({ + text, + language: language || "de", + speaker_wav: speakerWav || "", + }); + const url = new URL(XTTS_API_URL); + const options = { + hostname: url.hostname, + port: url.port || 80, + path: "/tts_to_audio/", + method: "POST", + headers: { + "Content-Type": "application/json", + "Content-Length": Buffer.byteLength(body), + }, + timeout: 60000, + }; + + const req = http.request(options, (res) => { + if (res.statusCode !== 200) { + let rb = ""; + res.on("data", (d) => { rb += d.toString(); }); + res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`))); + return; + } + let headerParsed = false; + let sampleRate = 24000; + let channels = 1; + let leftover = Buffer.alloc(0); + let headerBuf = Buffer.alloc(0); + const HEADER_BYTES = 44; + const PCM_CHUNK_BYTES = 8192; + + res.on("data", (chunk) => { + let data = chunk; + if (!headerParsed) { + headerBuf = Buffer.concat([headerBuf, data]); + if (headerBuf.length < HEADER_BYTES) return; + const header = headerBuf.slice(0, HEADER_BYTES); + try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {} + headerParsed = true; + data = headerBuf.slice(HEADER_BYTES); + } + let combined = Buffer.concat([leftover, data]); + while (combined.length >= PCM_CHUNK_BYTES) { + const slice = combined.slice(0, PCM_CHUNK_BYTES); + combined = combined.slice(PCM_CHUNK_BYTES); + onPcmChunk(slice.toString("base64"), { sampleRate, channels }); + } + leftover = combined; + }); + res.on("end", () => { + if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels }); + resolve(); + }); + res.on("error", reject); + }); + req.on("error", reject); + req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); }); + req.write(body); + req.end(); + }); +} + // ── Voice Upload Handler ──────────────────────────── async function handleVoiceUpload(payload) {