From 8929bc99bbf39eea1edd058ea13c3348279d988e Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Fri, 10 Apr 2026 02:23:29 +0200
Subject: [PATCH] fix: XTTS groups sentences into ~250 char chunks for
 consistent voice quality

- Sentences are grouped greedily by length, typically 2-3 per chunk (more context = stable voice/volume)
- Target max 250 chars per chunk incl. separators (keeps WebSocket packets manageable); a single longer sentence is still sent as one chunk
- Dots re-added between sentences within a chunk (natural pauses)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 xtts/bridge.js | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/xtts/bridge.js b/xtts/bridge.js
index 0300474..e1c5464 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -100,28 +100,42 @@ async function handleTTSRequest(payload) {
   // Markdown entfernen
   const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim();
 
-  // Text in Saetze aufteilen (sequentiell rendern fuer korrekte Reihenfolge)
+  // Text in Saetze aufteilen, dann zu Chunks von 2-3 Saetzen zusammenfassen
+  // (mehr Kontext = konsistentere Stimme/Lautstaerke, aber nicht zu lang fuer WebSocket)
   const sentences = cleanText.split(/(?<=[.!?])\s+/)
     .map(s => s.trim())
     .filter(s => s.length > 0)
-    .map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen (XTTS liest ihn sonst vor)
-  if (sentences.length === 0) return;
+    .map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen
 
-  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze, voice: ${voice || "default"}, lang: ${language || "de"})`);
+  const MAX_CHUNK_CHARS = 250; // Max ~250 Zeichen pro Chunk
+  const chunks = [];
+  let currentChunk = '';
+  for (const sentence of sentences) {
+    if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
+      chunks.push(currentChunk);
+      currentChunk = sentence;
+    } else {
+      currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence;
+    }
+  }
+  if (currentChunk) chunks.push(currentChunk);
+  if (chunks.length === 0) return;
+
+  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
 
   try {
     const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
     const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
 
-    // Jeden Satz sequentiell rendern und sofort senden
-    for (let i = 0; i < sentences.length; i++) {
-      const sentence = sentences[i];
+    // Jeden Chunk sequentiell rendern und sofort senden
+    for (let i = 0; i < chunks.length; i++) {
+      const chunk = chunks[i];
       try {
-        const audioBuffer = await callXTTSAPI(sentence, language || "de", hasCustomVoice ? voiceSample : null);
+        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
 
         if (audioBuffer && audioBuffer.length > 100) {
           const base64 = audioBuffer.toString("base64");
-          log(`TTS [${i + 1}/${sentences.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${sentence.slice(0, 40)}..."`);
+          log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${chunk.slice(0, 50)}..."`);
 
           sendToRVS({
             type: "xtts_response",
@@ -135,12 +149,12 @@ async function handleTTSRequest(payload) {
             timestamp: Date.now(),
           });
         }
-      } catch (sentenceErr) {
-        log(`TTS [${i + 1}/${sentences.length}] Fehler: ${sentenceErr.message} — ueberspringe`);
+      } catch (chunkErr) {
+        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
       }
     }
 
-    log(`TTS komplett: ${sentences.length} Saetze gerendert`);
+    log(`TTS komplett: ${chunks.length} Chunks gerendert`);
   } catch (err) {
     log(`TTS Fehler: ${err.message}`);
     sendToRVS({