feat: XTTS concatenates chunks into seamless WAV (no stuttering)

- All chunks rendered sequentially, PCM data concatenated
- Single WAV with proper header sent back (no queue needed in app)
- If total > 800KB, split into parts (WebSocket limit)
- Eliminates stuttering between sentences

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 02:40:16 +02:00 · 2026-04-10 02:40:16 +02:00 · 2c785cb37a
parent 57e65b061c
commit 2c785cb37a
1 changed file with 95 additions and 17 deletions
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -127,21 +127,102 @@ async function handleTTSRequest(payload) {
    const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
    const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);

-    // Jeden Chunk sequentiell rendern und sofort senden
+    // Alle Chunks sequentiell rendern und PCM-Daten sammeln
+    const pcmBuffers = [];
+    let sampleRate = 0;
+    let channels = 0;
+    let bitsPerSample = 0;
+
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      try {
        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);

-        if (audioBuffer && audioBuffer.length > 100) {
-          const base64 = audioBuffer.toString("base64");
-          log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${chunk.slice(0, 50)}..."`);
+        if (audioBuffer && audioBuffer.length > 44) {
+          // WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren
+          if (sampleRate === 0) {
+            channels = audioBuffer.readUInt16LE(22);
+            sampleRate = audioBuffer.readUInt32LE(24);
+            bitsPerSample = audioBuffer.readUInt16LE(34);
+          }
+          // PCM-Daten ab Byte 44
+          pcmBuffers.push(audioBuffer.slice(44));
+          log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`);
+        }
+      } catch (chunkErr) {
+        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
+      }
+    }
+
+    if (pcmBuffers.length === 0) {
+      log("TTS: Keine Audio-Daten erzeugt");
+      sendToRVS({
+        type: "xtts_response",
+        payload: { requestId, error: "Keine Audio-Daten" },
+        timestamp: Date.now(),
+      });
+      return;
+    }
+
+    // PCM-Daten zusammenfuegen und neuen WAV-Header schreiben
+    const allPcm = Buffer.concat(pcmBuffers);
+    const wavHeader = Buffer.alloc(44);
+    const byteRate = sampleRate * channels * (bitsPerSample / 8);
+    const blockAlign = channels * (bitsPerSample / 8);
+
+    wavHeader.write("RIFF", 0);
+    wavHeader.writeUInt32LE(36 + allPcm.length, 4);
+    wavHeader.write("WAVE", 8);
+    wavHeader.write("fmt ", 12);
+    wavHeader.writeUInt32LE(16, 16);           // Subchunk1Size
+    wavHeader.writeUInt16LE(1, 20);            // PCM format
+    wavHeader.writeUInt16LE(channels, 22);
+    wavHeader.writeUInt32LE(sampleRate, 24);
+    wavHeader.writeUInt32LE(byteRate, 28);
+    wavHeader.writeUInt16LE(blockAlign, 32);
+    wavHeader.writeUInt16LE(bitsPerSample, 34);
+    wavHeader.write("data", 36);
+    wavHeader.writeUInt32LE(allPcm.length, 40);
+
+    const completeWav = Buffer.concat([wavHeader, allPcm]);
+    const base64 = completeWav.toString("base64");
+
+    // Wenn zu gross (>800KB PCM) → in Teile splitten, sonst als Ganzes senden
+    const MAX_PCM_SIZE = 800 * 1024; // ~800KB PCM pro Nachricht
+    const pcmParts = [];
+    if (allPcm.length > MAX_PCM_SIZE) {
+      for (let offset = 0; offset < allPcm.length; offset += MAX_PCM_SIZE) {
+        pcmParts.push(allPcm.slice(offset, Math.min(offset + MAX_PCM_SIZE, allPcm.length)));
+      }
+    } else {
+      pcmParts.push(allPcm);
+    }
+
+    for (let p = 0; p < pcmParts.length; p++) {
+      const partPcm = pcmParts[p];
+      const partHeader = Buffer.alloc(44);
+      partHeader.write("RIFF", 0);
+      partHeader.writeUInt32LE(36 + partPcm.length, 4);
+      partHeader.write("WAVE", 8);
+      partHeader.write("fmt ", 12);
+      partHeader.writeUInt32LE(16, 16);
+      partHeader.writeUInt16LE(1, 20);
+      partHeader.writeUInt16LE(channels, 22);
+      partHeader.writeUInt32LE(sampleRate, 24);
+      partHeader.writeUInt32LE(byteRate, 28);
+      partHeader.writeUInt16LE(blockAlign, 32);
+      partHeader.writeUInt16LE(bitsPerSample, 34);
+      partHeader.write("data", 36);
+      partHeader.writeUInt32LE(partPcm.length, 40);
+
+      const partWav = Buffer.concat([partHeader, partPcm]);
+      const partBase64 = partWav.toString("base64");

      sendToRVS({
        type: "xtts_response",
        payload: {
-              requestId: `${requestId || ""}_${i}`,
-              base64,
+          requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`,
+          base64: partBase64,
          mimeType: "audio/wav",
          voice: voice || "default",
          engine: "xtts",
@@ -149,12 +230,9 @@ async function handleTTSRequest(payload) {
        timestamp: Date.now(),
      });
    }
-      } catch (chunkErr) {
-        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
-      }
-    }

-    log(`TTS komplett: ${chunks.length} Chunks gerendert`);
+    const totalSecs = (allPcm.length / byteRate).toFixed(1);
+    log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`);
  } catch (err) {
    log(`TTS Fehler: ${err.message}`);
    sendToRVS({