fix: Streaming TTS — doppeltes Audio + Gaps zwischen Saetzen

Zwei Probleme gefunden:

1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
aria-bridge hat audio_pcm von der XTTS-Bridge empfangen und per _send_to_rvs rebroadcastet. Ein RVS-Broadcast geht an ALLE Clients ausser dem Sender — die App bekam jeden Chunk also zweimal:

  XTTS-Bridge → RVS → App + aria-bridge
  aria-bridge → RVS → App (nochmal!) + XTTS-Bridge

Zwei ueberlagerte PCM-Streams klingen gedoppelt bzw. nach Artefakten.
Fix: aria-bridge ignoriert audio_pcm jetzt. Die messageId schickt die XTTS-Bridge selbst im Payload mit (via xtts_request -> messageId).

2) GAPS ZWISCHEN SAETZEN (abgehackt)
xtts/bridge.js teilte den Text in ~150-char Chunks und rief pro Chunk einen eigenen /tts_to_audio/ Request auf. Zwischen den Chunks lag die XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
Fix: cleanText geht JETZT komplett in einem Request an XTTS. Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
Kompromiss: Die ersten Samples kommen spaeter (das Rendern des ganzen Textes dauert laenger als das des ersten Satzes alleine), aber danach laeuft das Audio kontinuierlich ohne Unterbrechung.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 23:15:57 +02:00
parent 019c078393
commit 350069d371
2 changed files with 48 additions and 90 deletions
@@ -1100,25 +1100,12 @@ class ARIABridge:
            return

        elif msg_type == "audio_pcm":
-            # XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
-            # Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
-            error = payload.get("error", "")
-            if error:
-                logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
-                return
-            linked_message_id = payload.get("messageId", "")
-            if not linked_message_id:
-                req_id_full = payload.get("requestId", "")
-                req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
-                linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
-            # Einfach 1:1 weiterleiten mit eingefuellter messageId
-            forwarded = dict(payload)
-            forwarded["messageId"] = linked_message_id
-            await self._send_to_rvs({
-                "type": "audio_pcm",
-                "payload": forwarded,
-                "timestamp": int(asyncio.get_event_loop().time() * 1000),
-            })
+            # Audio-PCM geht direkt von XTTS-Bridge an die App.
+            # Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App
+            # jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast,
+            # einmal indirekt via uns).
+            # Wir ignorieren diese Message hier einfach — messageId wird von
+            # XTTS-Bridge selbst im Payload mitgeliefert.
            return

        elif msg_type == "xtts_response":
@@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
    .replace(/\(\)/g, "")
    .trim();

-  // Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
-  const sentences = cleanText.split(/(?<=[.!?])\s+/)
-    .map(s => s.trim())
-    .filter(s => s.length > 0)
-    .map(s => s.replace(/[.]+$/, ''));
-
-  const MAX_CHUNK_CHARS = 150;
-  const chunks = [];
-  let currentChunk = '';
-  for (const sentence of sentences) {
-    if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
-      chunks.push(currentChunk);
-      currentChunk = sentence;
-    } else {
-      currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
-    }
-  }
-  if (currentChunk) chunks.push(currentChunk);
-  if (chunks.length === 0) return;
-
-  log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
+  log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);

  try {
    const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
    const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);

    let chunkIndex = 0;
-    // Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
    let pcmMeta = null;

-    for (let i = 0; i < chunks.length; i++) {
-      const chunk = chunks[i];
-      const isLastChunk = i === chunks.length - 1;
-      try {
-        // Streaming: PCM-Frames werden nacheinander an RVS gepusht,
-        // sobald sie vom XTTS-Server reinkommen
-        await streamXTTSAsPCM(
-          chunk,
-          language || "de",
-          hasCustomVoice ? voiceSample : null,
-          (pcmBase64, meta) => {
-            if (!pcmMeta) pcmMeta = meta;
-            sendToRVS({
-              type: "audio_pcm",
-              payload: {
-                requestId: requestId || "",
-                messageId: messageId || "",
-                base64: pcmBase64,
-                format: "pcm_s16le",
-                sampleRate: meta.sampleRate,
-                channels: meta.channels,
-                voice: voice || "default",
-                chunk: chunkIndex++,
-                final: false,
-              },
-              timestamp: Date.now(),
-            });
+    // EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
+    // XTTS rendert und wir streamen PCM sobald es reinkommt.
+    await streamXTTSAsPCM(
+      cleanText,
+      language || "de",
+      hasCustomVoice ? voiceSample : null,
+      (pcmBase64, meta) => {
+        if (!pcmMeta) pcmMeta = meta;
+        sendToRVS({
+          type: "audio_pcm",
+          payload: {
+            requestId: requestId || "",
+            messageId: messageId || "",
+            base64: pcmBase64,
+            format: "pcm_s16le",
+            sampleRate: meta.sampleRate,
+            channels: meta.channels,
+            voice: voice || "default",
+            chunk: chunkIndex++,
+            final: false,
          },
-        );
+          timestamp: Date.now(),
+        });
+      },
+    );

-        // Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
-        if (isLastChunk && pcmMeta) {
-          sendToRVS({
-            type: "audio_pcm",
-            payload: {
-              requestId: requestId || "",
-              messageId: messageId || "",
-              base64: "",
-              format: "pcm_s16le",
-              sampleRate: pcmMeta.sampleRate,
-              channels: pcmMeta.channels,
-              voice: voice || "default",
-              chunk: chunkIndex++,
-              final: true,
-            },
-            timestamp: Date.now(),
-          });
-        }
-      } catch (chunkErr) {
-        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
-      }
+    // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
+    if (pcmMeta) {
+      sendToRVS({
+        type: "audio_pcm",
+        payload: {
+          requestId: requestId || "",
+          messageId: messageId || "",
+          base64: "",
+          format: "pcm_s16le",
+          sampleRate: pcmMeta.sampleRate,
+          channels: pcmMeta.channels,
+          voice: voice || "default",
+          chunk: chunkIndex++,
+          final: true,
+        },
+        timestamp: Date.now(),
+      });
    }

    log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);