From 350069d3714d6e0678f6ab8bcb146ad971321635 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Sun, 19 Apr 2026 23:15:57 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Streaming=20TTS=20=E2=80=94=20doppeltes?= =?UTF-8?q?=20Audio=20+=20Gaps=20zwischen=20Saetzen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zwei Probleme gefunden: 1) DOPPELTES AUDIO (Kern-Ursache der Artefakte) aria-bridge hat audio_pcm von XTTS-Bridge empfangen und per _send_to_rvs rebroadcastet. RVS-Broadcast geht an ALLE Clients ausser dem Sender — die App bekam jeden Chunk also zweimal: XTTS-Bridge → RVS → App + aria-bridge aria-bridge → RVS → App (nochmal!) + XTTS-Bridge Zwei ueberlagerte PCM-Streams klingen wie Doubled/Artefakte. Fix: aria-bridge ignoriert audio_pcm jetzt. messageId schickt XTTS-Bridge selbst im Payload (via xtts_request -> messageId). 2) GAPS ZWISCHEN SAETZEN (abgehackt) xtts/bridge.js teilte Text in ~150-char Chunks und rief pro Chunk einen eigenen /tts_to_audio/ Request auf. Zwischen Chunks lag die XTTS-Render-Zeit (1-3s) → hoerbare Pausen. Fix: cleanText geht JETZT in einem Request komplett an XTTS. Ein zusammenhaengender Stream → keine Satz-Gaps mehr. Kompromiss: Erste Samples kommen spaeter (der ganze Text-Render dauert laenger als der erste Satz alleine), aber dann kontinuierlich ohne Unterbrechung. Co-Authored-By: Claude Opus 4.7 (1M context) --- bridge/aria_bridge.py | 25 +++------- xtts/bridge.js | 113 ++++++++++++++++-------------------------- 2 files changed, 48 insertions(+), 90 deletions(-) diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index a46146b..1d5655c 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -1100,25 +1100,12 @@ class ARIABridge: return elif msg_type == "audio_pcm": - # XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App. - # Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen. 
- error = payload.get("error", "") - if error: - logger.warning("[rvs] XTTS PCM-Fehler: %s", error) - return - linked_message_id = payload.get("messageId", "") - if not linked_message_id: - req_id_full = payload.get("requestId", "") - req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full - linked_message_id = self._xtts_request_to_message.get(req_id_base, "") - # Einfach 1:1 weiterleiten mit eingefuellter messageId - forwarded = dict(payload) - forwarded["messageId"] = linked_message_id - await self._send_to_rvs({ - "type": "audio_pcm", - "payload": forwarded, - "timestamp": int(asyncio.get_event_loop().time() * 1000), - }) + # Audio-PCM geht direkt von XTTS-Bridge an die App. + # Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App + # jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast, + # einmal indirekt via uns). + # Wir ignorieren diese Message hier einfach — messageId wird von + # XTTS-Bridge selbst im Payload mitgeliefert. return elif msg_type == "xtts_response": diff --git a/xtts/bridge.js b/xtts/bridge.js index b20cf82..f05a133 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -116,87 +116,58 @@ async function handleTTSRequest(payload) { .replace(/\(\)/g, "") .trim(); - // Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren) - const sentences = cleanText.split(/(?<=[.!?])\s+/) - .map(s => s.trim()) - .filter(s => s.length > 0) - .map(s => s.replace(/[.]+$/, '')); - - const MAX_CHUNK_CHARS = 150; - const chunks = []; - let currentChunk = ''; - for (const sentence of sentences) { - if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) { - chunks.push(currentChunk); - currentChunk = sentence; - } else { - currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence; - } - } - if (currentChunk) chunks.push(currentChunk); - if (chunks.length === 0) return; - - log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." 
(${chunks.length} Chunks, voice: ${voice || "default"})`); + log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`); try { const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); let chunkIndex = 0; - // Audio-Format (aus WAV-Header extrahiert, einmal pro Request) let pcmMeta = null; - for (let i = 0; i < chunks.length; i++) { - const chunk = chunks[i]; - const isLastChunk = i === chunks.length - 1; - try { - // Streaming: PCM-Frames werden nacheinander an RVS gepusht, - // sobald sie vom XTTS-Server reinkommen - await streamXTTSAsPCM( - chunk, - language || "de", - hasCustomVoice ? voiceSample : null, - (pcmBase64, meta) => { - if (!pcmMeta) pcmMeta = meta; - sendToRVS({ - type: "audio_pcm", - payload: { - requestId: requestId || "", - messageId: messageId || "", - base64: pcmBase64, - format: "pcm_s16le", - sampleRate: meta.sampleRate, - channels: meta.channels, - voice: voice || "default", - chunk: chunkIndex++, - final: false, - }, - timestamp: Date.now(), - }); + // EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen. + // XTTS rendert und wir streamen PCM sobald es reinkommt. + await streamXTTSAsPCM( + cleanText, + language || "de", + hasCustomVoice ? 
voiceSample : null, + (pcmBase64, meta) => { + if (!pcmMeta) pcmMeta = meta; + sendToRVS({ + type: "audio_pcm", + payload: { + requestId: requestId || "", + messageId: messageId || "", + base64: pcmBase64, + format: "pcm_s16le", + sampleRate: meta.sampleRate, + channels: meta.channels, + voice: voice || "default", + chunk: chunkIndex++, + final: false, }, - ); + timestamp: Date.now(), + }); + }, + ); - // Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig" - if (isLastChunk && pcmMeta) { - sendToRVS({ - type: "audio_pcm", - payload: { - requestId: requestId || "", - messageId: messageId || "", - base64: "", - format: "pcm_s16le", - sampleRate: pcmMeta.sampleRate, - channels: pcmMeta.channels, - voice: voice || "default", - chunk: chunkIndex++, - final: true, - }, - timestamp: Date.now(), - }); - } - } catch (chunkErr) { - log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`); - } + // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann + if (pcmMeta) { + sendToRVS({ + type: "audio_pcm", + payload: { + requestId: requestId || "", + messageId: messageId || "", + base64: "", + format: "pcm_s16le", + sampleRate: pcmMeta.sampleRate, + channels: pcmMeta.channels, + voice: voice || "default", + chunk: chunkIndex++, + final: true, + }, + timestamp: Date.now(), + }); } log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);