fix: XTTS streaming mode - send each chunk immediately, comma between sentences

- Back to streaming: render chunk → send immediately → next chunk
- App plays with preloading queue (no waiting for all chunks)
- Comma instead of dot between sentences in chunk (no "Punkt" read aloud)
- Sentence-ending dots already removed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
duffyduck 2026-04-10 02:48:50 +02:00
parent 81f7c38383
commit f7f450a09d
1 changed file with 22 additions and 97 deletions

View File

@@ -115,7 +115,7 @@ async function handleTTSRequest(payload) {
chunks.push(currentChunk); chunks.push(currentChunk);
currentChunk = sentence; currentChunk = sentence;
} else { } else {
currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence; currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
} }
} }
if (currentChunk) chunks.push(currentChunk); if (currentChunk) chunks.push(currentChunk);
@@ -127,114 +127,39 @@ async function handleTTSRequest(payload) {
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
// Alle Chunks sequentiell rendern und PCM-Daten sammeln // Streaming: Chunk rendern → sofort senden → naechster Chunk
const pcmBuffers = []; // App spielt mit Preloading-Queue nahtlos ab
let sampleRate = 0; let sentCount = 0;
let channels = 0;
let bitsPerSample = 0;
for (let i = 0; i < chunks.length; i++) { for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i]; const chunk = chunks[i];
try { try {
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null); const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
if (audioBuffer && audioBuffer.length > 44) { if (audioBuffer && audioBuffer.length > 100) {
// WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
if (sampleRate === 0) {
channels = audioBuffer.readUInt16LE(22); sendToRVS({
sampleRate = audioBuffer.readUInt32LE(24); type: "xtts_response",
bitsPerSample = audioBuffer.readUInt16LE(34); payload: {
} requestId: `${requestId || ""}_${i}`,
// PCM-Daten ab Byte 44 base64: audioBuffer.toString("base64"),
pcmBuffers.push(audioBuffer.slice(44)); mimeType: "audio/wav",
log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`); voice: voice || "default",
engine: "xtts",
part: i + 1,
totalParts: chunks.length,
},
timestamp: Date.now(),
});
sentCount++;
} }
} catch (chunkErr) { } catch (chunkErr) {
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`); log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
} }
} }
if (pcmBuffers.length === 0) { log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
log("TTS: Keine Audio-Daten erzeugt");
sendToRVS({
type: "xtts_response",
payload: { requestId, error: "Keine Audio-Daten" },
timestamp: Date.now(),
});
return;
}
// PCM-Daten zusammenfuegen und neuen WAV-Header schreiben
const allPcm = Buffer.concat(pcmBuffers);
const wavHeader = Buffer.alloc(44);
const byteRate = sampleRate * channels * (bitsPerSample / 8);
const blockAlign = channels * (bitsPerSample / 8);
wavHeader.write("RIFF", 0);
wavHeader.writeUInt32LE(36 + allPcm.length, 4);
wavHeader.write("WAVE", 8);
wavHeader.write("fmt ", 12);
wavHeader.writeUInt32LE(16, 16); // Subchunk1Size
wavHeader.writeUInt16LE(1, 20); // PCM format
wavHeader.writeUInt16LE(channels, 22);
wavHeader.writeUInt32LE(sampleRate, 24);
wavHeader.writeUInt32LE(byteRate, 28);
wavHeader.writeUInt16LE(blockAlign, 32);
wavHeader.writeUInt16LE(bitsPerSample, 34);
wavHeader.write("data", 36);
wavHeader.writeUInt32LE(allPcm.length, 40);
const completeWav = Buffer.concat([wavHeader, allPcm]);
const base64 = completeWav.toString("base64");
// In ~8 Sekunden Teile splitten (nahtlos genug fuer Queue, klein genug fuer WebSocket)
const samplesPerSec = sampleRate * channels * (bitsPerSample / 8);
const TARGET_SECS = 8; // ~8 Sekunden pro Teil
const targetBytes = samplesPerSec * TARGET_SECS;
const pcmParts = [];
for (let offset = 0; offset < allPcm.length; offset += targetBytes) {
pcmParts.push(allPcm.slice(offset, Math.min(offset + targetBytes, allPcm.length)));
}
function buildWav(pcmData) {
const header = Buffer.alloc(44);
header.write("RIFF", 0);
header.writeUInt32LE(36 + pcmData.length, 4);
header.write("WAVE", 8);
header.write("fmt ", 12);
header.writeUInt32LE(16, 16);
header.writeUInt16LE(1, 20);
header.writeUInt16LE(channels, 22);
header.writeUInt32LE(sampleRate, 24);
header.writeUInt32LE(byteRate, 28);
header.writeUInt16LE(blockAlign, 32);
header.writeUInt16LE(bitsPerSample, 34);
header.write("data", 36);
header.writeUInt32LE(pcmData.length, 40);
return Buffer.concat([header, pcmData]);
}
for (let p = 0; p < pcmParts.length; p++) {
const partWav = buildWav(pcmParts[p]);
sendToRVS({
type: "xtts_response",
payload: {
requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`,
base64: partWav.toString("base64"),
mimeType: "audio/wav",
voice: voice || "default",
engine: "xtts",
part: p + 1,
totalParts: pcmParts.length,
},
timestamp: Date.now(),
});
}
const totalSecs = (allPcm.length / byteRate).toFixed(1);
log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`);
} catch (err) { } catch (err) {
log(`TTS Fehler: ${err.message}`); log(`TTS Fehler: ${err.message}`);
sendToRVS({ sendToRVS({