From f7f450a09d86098e129f84a49195b0496722d19c Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Fri, 10 Apr 2026 02:48:50 +0200
Subject: [PATCH] fix: XTTS streaming mode - send each chunk immediately, comma
 between sentences
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Back to streaming: render chunk → send immediately → next chunk
- App plays with preloading queue (no waiting for all chunks)
- Join sentences within a chunk with a comma instead of a period, so that
  no "Punkt" is read aloud
- Sentence-ending dots already removed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 xtts/bridge.js | 119 +++++++++----------------------------------------
 1 file changed, 22 insertions(+), 97 deletions(-)

diff --git a/xtts/bridge.js b/xtts/bridge.js
index 5b7d673..84fb20b 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -115,7 +115,7 @@ async function handleTTSRequest(payload) {
       chunks.push(currentChunk);
       currentChunk = sentence;
     } else {
-      currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence;
+      currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
     }
   }
   if (currentChunk) chunks.push(currentChunk);
@@ -127,114 +127,39 @@ async function handleTTSRequest(payload) {
     const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
     const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
 
-    // Alle Chunks sequentiell rendern und PCM-Daten sammeln
-    const pcmBuffers = [];
-    let sampleRate = 0;
-    let channels = 0;
-    let bitsPerSample = 0;
+    // Streaming: Chunk rendern → sofort senden → naechster Chunk
+    // App spielt mit Preloading-Queue nahtlos ab
+    let sentCount = 0;
 
     for (let i = 0; i < chunks.length; i++) {
       const chunk = chunks[i];
       try {
         const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
 
-        if (audioBuffer && audioBuffer.length > 44) {
-          // WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren
-          if (sampleRate === 0) {
-            channels = audioBuffer.readUInt16LE(22);
-            sampleRate = audioBuffer.readUInt32LE(24);
-            bitsPerSample = audioBuffer.readUInt16LE(34);
-          }
-          // PCM-Daten ab Byte 44
-          pcmBuffers.push(audioBuffer.slice(44));
-          log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`);
+        if (audioBuffer && audioBuffer.length > 100) {
+          log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
+
+          sendToRVS({
+            type: "xtts_response",
+            payload: {
+              requestId: `${requestId || ""}_${i}`,
+              base64: audioBuffer.toString("base64"),
+              mimeType: "audio/wav",
+              voice: voice || "default",
+              engine: "xtts",
+              part: i + 1,
+              totalParts: chunks.length,
+            },
+            timestamp: Date.now(),
+          });
+          sentCount++;
         }
       } catch (chunkErr) {
         log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
       }
     }
 
-    if (pcmBuffers.length === 0) {
-      log("TTS: Keine Audio-Daten erzeugt");
-      sendToRVS({
-        type: "xtts_response",
-        payload: { requestId, error: "Keine Audio-Daten" },
-        timestamp: Date.now(),
-      });
-      return;
-    }
-
-    // PCM-Daten zusammenfuegen und neuen WAV-Header schreiben
-    const allPcm = Buffer.concat(pcmBuffers);
-    const wavHeader = Buffer.alloc(44);
-    const byteRate = sampleRate * channels * (bitsPerSample / 8);
-    const blockAlign = channels * (bitsPerSample / 8);
-
-    wavHeader.write("RIFF", 0);
-    wavHeader.writeUInt32LE(36 + allPcm.length, 4);
-    wavHeader.write("WAVE", 8);
-    wavHeader.write("fmt ", 12);
-    wavHeader.writeUInt32LE(16, 16);           // Subchunk1Size
-    wavHeader.writeUInt16LE(1, 20);            // PCM format
-    wavHeader.writeUInt16LE(channels, 22);
-    wavHeader.writeUInt32LE(sampleRate, 24);
-    wavHeader.writeUInt32LE(byteRate, 28);
-    wavHeader.writeUInt16LE(blockAlign, 32);
-    wavHeader.writeUInt16LE(bitsPerSample, 34);
-    wavHeader.write("data", 36);
-    wavHeader.writeUInt32LE(allPcm.length, 40);
-
-    const completeWav = Buffer.concat([wavHeader, allPcm]);
-    const base64 = completeWav.toString("base64");
-
-    // In ~8 Sekunden Teile splitten (nahtlos genug fuer Queue, klein genug fuer WebSocket)
-    const samplesPerSec = sampleRate * channels * (bitsPerSample / 8);
-    const TARGET_SECS = 8; // ~8 Sekunden pro Teil
-    const targetBytes = samplesPerSec * TARGET_SECS;
-
-    const pcmParts = [];
-    for (let offset = 0; offset < allPcm.length; offset += targetBytes) {
-      pcmParts.push(allPcm.slice(offset, Math.min(offset + targetBytes, allPcm.length)));
-    }
-
-    function buildWav(pcmData) {
-      const header = Buffer.alloc(44);
-      header.write("RIFF", 0);
-      header.writeUInt32LE(36 + pcmData.length, 4);
-      header.write("WAVE", 8);
-      header.write("fmt ", 12);
-      header.writeUInt32LE(16, 16);
-      header.writeUInt16LE(1, 20);
-      header.writeUInt16LE(channels, 22);
-      header.writeUInt32LE(sampleRate, 24);
-      header.writeUInt32LE(byteRate, 28);
-      header.writeUInt16LE(blockAlign, 32);
-      header.writeUInt16LE(bitsPerSample, 34);
-      header.write("data", 36);
-      header.writeUInt32LE(pcmData.length, 40);
-      return Buffer.concat([header, pcmData]);
-    }
-
-    for (let p = 0; p < pcmParts.length; p++) {
-      const partWav = buildWav(pcmParts[p]);
-
-      sendToRVS({
-        type: "xtts_response",
-        payload: {
-          requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`,
-          base64: partWav.toString("base64"),
-          mimeType: "audio/wav",
-          voice: voice || "default",
-          engine: "xtts",
-          part: p + 1,
-          totalParts: pcmParts.length,
-        },
-        timestamp: Date.now(),
-      });
-    }
-
-    const totalSecs = (allPcm.length / byteRate).toFixed(1);
-    log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`);
+    log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
   } catch (err) {
     log(`TTS Fehler: ${err.message}`);
     sendToRVS({