From b3d3b8b6bc353662782bd38dda4a3162ae7978cb Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Fri, 10 Apr 2026 02:03:29 +0200
Subject: [PATCH] fix: XTTS bridge splits text into sentences sequentially

- The XTTS bridge now does the sentence splitting (no longer the ARIA bridge)
- Sentences are rendered sequentially, so the correct order is guaranteed
- Each sentence is sent as a separate xtts_response (requestId suffixed with _<index>)
- Markdown bold markers (**...**) are stripped before splitting
- App starts playback after the first sentence (faster UX)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bridge/aria_bridge.py | 31 +++++++++--------------
 xtts/bridge.js        | 58 ++++++++++++++++++++++++-------------------
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py
index 789effc..3e1331e 100644
--- a/bridge/aria_bridge.py
+++ b/bridge/aria_bridge.py
@@ -851,27 +851,20 @@ class ARIABridge:
             tts_engine = getattr(self, 'tts_engine_type', 'piper')
 
             if tts_engine == "xtts":
-                # XTTS: Lange Texte satzweise aufteilen (WebSocket-Limit + schnellere Wiedergabe)
-                import re as _re
+                # XTTS: Ganzen Text senden, XTTS-Bridge teilt satzweise auf
                 xtts_voice = getattr(self, 'xtts_voice', '')
-                clean_text = _re.sub(r'\*\*([^*]+)\*\*', r'\1', text).strip()
-                sentences = _re.split(r'(?<=[.!?])\s+', clean_text)
-                sentences = [s.strip() for s in sentences if s.strip()]
-                if not sentences:
-                    sentences = [clean_text]
                 try:
-                    for sentence in sentences:
-                        await self._send_to_rvs({
-                            "type": "xtts_request",
-                            "payload": {
-                                "text": sentence,
-                                "voice": xtts_voice,
-                                "language": "de",
-                                "requestId": str(uuid.uuid4()),
-                            },
-                            "timestamp": int(asyncio.get_event_loop().time() * 1000),
-                        })
-                    logger.info("[core] XTTS-Request gesendet (%s, %d Saetze): '%s'", xtts_voice or "default", len(sentences), text[:60])
+                    await self._send_to_rvs({
+                        "type": "xtts_request",
+                        "payload": {
+                            "text": text,
+                            "voice": xtts_voice,
+                            "language": "de",
+                            "requestId": str(uuid.uuid4()),
+                        },
+                        "timestamp": int(asyncio.get_event_loop().time() * 1000),
+                    })
+                    logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", text[:60])
                 except Exception as e:
                     logger.warning("[core] XTTS-Request fehlgeschlagen: %s — Fallback auf Piper", e)
                     # Fallback auf Piper
diff --git a/xtts/bridge.js b/xtts/bridge.js
index 12d271b..7a1e04d 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -97,39 +97,47 @@ async function handleTTSRequest(payload) {
   const { text, voice, requestId, language } = payload;
   if (!text) return;
 
-  log(`TTS-Request: "${text.slice(0, 60)}..." (voice: ${voice || "default"}, lang: ${language || "de"})`);
+  // Markdown entfernen
+  const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim();
+
+  // Text in Saetze aufteilen (sequentiell rendern fuer korrekte Reihenfolge)
+  const sentences = cleanText.split(/(?<=[.!?])\s+/).map(s => s.trim()).filter(s => s.length > 0);
+  if (sentences.length === 0) return;
+
+  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze, voice: ${voice || "default"}, lang: ${language || "de"})`);
 
   try {
-    // Voice-Sample Pfad bestimmen
     const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
     const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
 
-    // XTTS API aufrufen
-    const audioBuffer = await callXTTSAPI(text, language || "de", hasCustomVoice ? voiceSample : null);
+    // Jeden Satz sequentiell rendern und sofort senden
+    for (let i = 0; i < sentences.length; i++) {
+      const sentence = sentences[i];
+      try {
+        const audioBuffer = await callXTTSAPI(sentence, language || "de", hasCustomVoice ? voiceSample : null);
 
-    if (audioBuffer && audioBuffer.length > 100) {
-      const base64 = audioBuffer.toString("base64");
-      log(`TTS fertig: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB)`);
+        if (audioBuffer && audioBuffer.length > 100) {
+          const base64 = audioBuffer.toString("base64");
+          log(`TTS [${i + 1}/${sentences.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${sentence.slice(0, 40)}..."`);
 
-      sendToRVS({
-        type: "xtts_response",
-        payload: {
-          requestId: requestId || "",
-          base64,
-          mimeType: "audio/wav",
-          voice: voice || "default",
-          engine: "xtts",
-        },
-        timestamp: Date.now(),
-      });
-    } else {
-      log("TTS: Leeres Audio erhalten");
-      sendToRVS({
-        type: "xtts_response",
-        payload: { requestId, error: "Leeres Audio" },
-        timestamp: Date.now(),
-      });
+          sendToRVS({
+            type: "xtts_response",
+            payload: {
+              requestId: `${requestId || ""}_${i}`,
+              base64,
+              mimeType: "audio/wav",
+              voice: voice || "default",
+              engine: "xtts",
+            },
+            timestamp: Date.now(),
+          });
+        }
+      } catch (sentenceErr) {
+        log(`TTS [${i + 1}/${sentences.length}] Fehler: ${sentenceErr.message} — ueberspringe`);
+      }
     }
+
+    log(`TTS komplett: ${sentences.length} Saetze gerendert`);
   } catch (err) {
     log(`TTS Fehler: ${err.message}`);
     sendToRVS({