From 350069d3714d6e0678f6ab8bcb146ad971321635 Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Sun, 19 Apr 2026 23:15:57 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20Streaming=20TTS=20=E2=80=94=20doppeltes?=
 =?UTF-8?q?=20Audio=20+=20Gaps=20zwischen=20Saetzen?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zwei Probleme gefunden:

1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
   aria-bridge hat audio_pcm von der XTTS-Bridge empfangen und per
   _send_to_rvs rebroadcastet. Ein RVS-Broadcast geht an ALLE Clients
   ausser dem Sender — die App bekam jeden Chunk also zweimal:
     XTTS-Bridge → RVS → App + aria-bridge
     aria-bridge → RVS → App (nochmal!) + XTTS-Bridge
   Zwei ueberlagerte PCM-Streams klingen gedoppelt und erzeugen Artefakte.
   Fix: aria-bridge ignoriert audio_pcm jetzt. Die messageId schickt die
   XTTS-Bridge selbst im Payload mit (via xtts_request -> messageId).

2) GAPS ZWISCHEN SAETZEN (abgehackt)
   xtts/bridge.js teilte den Text in ~150-Zeichen-Chunks und schickte pro
   Chunk einen eigenen /tts_to_audio/-Request. Zwischen den Chunks lag die
   XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
   Fix: cleanText geht JETZT in einem Request komplett an XTTS.
   Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
   Kompromiss: Die ersten Samples kommen spaeter (das Rendern des ganzen
   Textes dauert laenger als der erste Satz alleine), aber danach laeuft
   das Audio kontinuierlich ohne Unterbrechung.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 bridge/aria_bridge.py |  25 +++-------
 xtts/bridge.js        | 113 ++++++++++++++++--------------------------
 2 files changed, 48 insertions(+), 90 deletions(-)

diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py
index a46146b..1d5655c 100644
--- a/bridge/aria_bridge.py
+++ b/bridge/aria_bridge.py
@@ -1100,25 +1100,12 @@ class ARIABridge:
             return
 
         elif msg_type == "audio_pcm":
-            # XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
-            # Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
-            error = payload.get("error", "")
-            if error:
-                logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
-                return
-            linked_message_id = payload.get("messageId", "")
-            if not linked_message_id:
-                req_id_full = payload.get("requestId", "")
-                req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
-                linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
-            # Einfach 1:1 weiterleiten mit eingefuellter messageId
-            forwarded = dict(payload)
-            forwarded["messageId"] = linked_message_id
-            await self._send_to_rvs({
-                "type": "audio_pcm",
-                "payload": forwarded,
-                "timestamp": int(asyncio.get_event_loop().time() * 1000),
-            })
+            # Audio-PCM geht direkt von XTTS-Bridge an die App.
+            # Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App
+            # jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast,
+            # einmal indirekt via uns).
+            # Wir ignorieren diese Message hier einfach — messageId wird von
+            # XTTS-Bridge selbst im Payload mitgeliefert.
             return
 
         elif msg_type == "xtts_response":
diff --git a/xtts/bridge.js b/xtts/bridge.js
index b20cf82..f05a133 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
     .replace(/\(\)/g, "")
     .trim();
 
-  // Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
-  const sentences = cleanText.split(/(?<=[.!?])\s+/)
-    .map(s => s.trim())
-    .filter(s => s.length > 0)
-    .map(s => s.replace(/[.]+$/, ''));
-
-  const MAX_CHUNK_CHARS = 150;
-  const chunks = [];
-  let currentChunk = '';
-  for (const sentence of sentences) {
-    if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
-      chunks.push(currentChunk);
-      currentChunk = sentence;
-    } else {
-      currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
-    }
-  }
-  if (currentChunk) chunks.push(currentChunk);
-  if (chunks.length === 0) return;
-
-  log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
+  log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
 
   try {
     const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
     const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
 
     let chunkIndex = 0;
-    // Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
     let pcmMeta = null;
 
-    for (let i = 0; i < chunks.length; i++) {
-      const chunk = chunks[i];
-      const isLastChunk = i === chunks.length - 1;
-      try {
-        // Streaming: PCM-Frames werden nacheinander an RVS gepusht,
-        // sobald sie vom XTTS-Server reinkommen
-        await streamXTTSAsPCM(
-          chunk,
-          language || "de",
-          hasCustomVoice ? voiceSample : null,
-          (pcmBase64, meta) => {
-            if (!pcmMeta) pcmMeta = meta;
-            sendToRVS({
-              type: "audio_pcm",
-              payload: {
-                requestId: requestId || "",
-                messageId: messageId || "",
-                base64: pcmBase64,
-                format: "pcm_s16le",
-                sampleRate: meta.sampleRate,
-                channels: meta.channels,
-                voice: voice || "default",
-                chunk: chunkIndex++,
-                final: false,
-              },
-              timestamp: Date.now(),
-            });
+    // EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
+    // XTTS rendert und wir streamen PCM sobald es reinkommt.
+    await streamXTTSAsPCM(
+      cleanText,
+      language || "de",
+      hasCustomVoice ? voiceSample : null,
+      (pcmBase64, meta) => {
+        if (!pcmMeta) pcmMeta = meta;
+        sendToRVS({
+          type: "audio_pcm",
+          payload: {
+            requestId: requestId || "",
+            messageId: messageId || "",
+            base64: pcmBase64,
+            format: "pcm_s16le",
+            sampleRate: meta.sampleRate,
+            channels: meta.channels,
+            voice: voice || "default",
+            chunk: chunkIndex++,
+            final: false,
           },
-        );
+          timestamp: Date.now(),
+        });
+      },
+    );
 
-        // Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
-        if (isLastChunk && pcmMeta) {
-          sendToRVS({
-            type: "audio_pcm",
-            payload: {
-              requestId: requestId || "",
-              messageId: messageId || "",
-              base64: "",
-              format: "pcm_s16le",
-              sampleRate: pcmMeta.sampleRate,
-              channels: pcmMeta.channels,
-              voice: voice || "default",
-              chunk: chunkIndex++,
-              final: true,
-            },
-            timestamp: Date.now(),
-          });
-        }
-      } catch (chunkErr) {
-        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
-      }
+    // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
+    if (pcmMeta) {
+      sendToRVS({
+        type: "audio_pcm",
+        payload: {
+          requestId: requestId || "",
+          messageId: messageId || "",
+          base64: "",
+          format: "pcm_s16le",
+          sampleRate: pcmMeta.sampleRate,
+          channels: pcmMeta.channels,
+          voice: voice || "default",
+          chunk: chunkIndex++,
+          final: true,
+        },
+        timestamp: Date.now(),
+      });
     }
 
     log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);