From 2c785cb37a7b4bd1adf57a111fd56ccdcbe8bd39 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Fri, 10 Apr 2026 02:40:16 +0200 Subject: [PATCH] feat: XTTS concatenates chunks into seamless WAV (no stuttering) - All chunks rendered sequentially, PCM data concatenated - Single WAV with proper header sent back (no queue needed in app) - If total > 800KB, split into parts (WebSocket limit) - Eliminates stuttering between sentences Co-Authored-By: Claude Opus 4.6 (1M context) --- xtts/bridge.js | 112 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 17 deletions(-) diff --git a/xtts/bridge.js b/xtts/bridge.js index e1c5464..1a07e3f 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -127,34 +127,112 @@ async function handleTTSRequest(payload) { const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); - // Jeden Chunk sequentiell rendern und sofort senden + // Alle Chunks sequentiell rendern und PCM-Daten sammeln + const pcmBuffers = []; + let sampleRate = 0; + let channels = 0; + let bitsPerSample = 0; + for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; try { const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null); - if (audioBuffer && audioBuffer.length > 100) { - const base64 = audioBuffer.toString("base64"); - log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${chunk.slice(0, 50)}..."`); - - sendToRVS({ - type: "xtts_response", - payload: { - requestId: `${requestId || ""}_${i}`, - base64, - mimeType: "audio/wav", - voice: voice || "default", - engine: "xtts", - }, - timestamp: Date.now(), - }); + if (audioBuffer && audioBuffer.length > 44) { + // WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren + if (sampleRate === 0) { + channels = audioBuffer.readUInt16LE(22); + sampleRate = audioBuffer.readUInt32LE(24); + bitsPerSample = audioBuffer.readUInt16LE(34); + } + // PCM-Daten ab Byte 44 + pcmBuffers.push(audioBuffer.slice(44)); + log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`); } } catch (chunkErr) { log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`); } } - log(`TTS komplett: ${chunks.length} Chunks gerendert`); + if (pcmBuffers.length === 0) { + log("TTS: Keine Audio-Daten erzeugt"); + sendToRVS({ + type: "xtts_response", + payload: { requestId, error: "Keine Audio-Daten" }, + timestamp: Date.now(), + }); + return; + } + + // PCM-Daten zusammenfuegen und neuen WAV-Header schreiben + const allPcm = Buffer.concat(pcmBuffers); + const wavHeader = Buffer.alloc(44); + const byteRate = sampleRate * channels * (bitsPerSample / 8); + const blockAlign = channels * (bitsPerSample / 8); + + wavHeader.write("RIFF", 0); + wavHeader.writeUInt32LE(36 + allPcm.length, 4); + wavHeader.write("WAVE", 8); + wavHeader.write("fmt ", 12); + wavHeader.writeUInt32LE(16, 16); // Subchunk1Size + wavHeader.writeUInt16LE(1, 20); // PCM format + wavHeader.writeUInt16LE(channels, 22); + wavHeader.writeUInt32LE(sampleRate, 24); + wavHeader.writeUInt32LE(byteRate, 28); + wavHeader.writeUInt16LE(blockAlign, 32); + wavHeader.writeUInt16LE(bitsPerSample, 34); + wavHeader.write("data", 36); + wavHeader.writeUInt32LE(allPcm.length, 40); + + const completeWav = Buffer.concat([wavHeader, allPcm]); + const base64 = completeWav.toString("base64"); + + // Wenn zu gross (>800KB PCM) → in Teile splitten, sonst als Ganzes senden + const MAX_PCM_SIZE = 800 * 1024; // ~800KB PCM pro Nachricht + const pcmParts = []; + if (allPcm.length > MAX_PCM_SIZE) { + for (let offset = 0; offset < allPcm.length; offset += MAX_PCM_SIZE) { + pcmParts.push(allPcm.slice(offset, Math.min(offset + MAX_PCM_SIZE, allPcm.length))); + } + } else { + pcmParts.push(allPcm); + } + + for (let p = 0; p < pcmParts.length; p++) { + const partPcm = pcmParts[p]; + const partHeader = Buffer.alloc(44); + partHeader.write("RIFF", 0); + partHeader.writeUInt32LE(36 + partPcm.length, 4); + partHeader.write("WAVE", 8); + partHeader.write("fmt ", 12); + partHeader.writeUInt32LE(16, 16); + partHeader.writeUInt16LE(1, 20); + partHeader.writeUInt16LE(channels, 22); + partHeader.writeUInt32LE(sampleRate, 24); + partHeader.writeUInt32LE(byteRate, 28); + partHeader.writeUInt16LE(blockAlign, 32); + partHeader.writeUInt16LE(bitsPerSample, 34); + partHeader.write("data", 36); + partHeader.writeUInt32LE(partPcm.length, 40); + + const partWav = Buffer.concat([partHeader, partPcm]); + const partBase64 = partWav.toString("base64"); + + sendToRVS({ + type: "xtts_response", + payload: { + requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`, + base64: partBase64, + mimeType: "audio/wav", + voice: voice || "default", + engine: "xtts", + }, + timestamp: Date.now(), + }); + } + + const totalSecs = (allPcm.length / byteRate).toFixed(1); + log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`); } catch (err) { log(`TTS Fehler: ${err.message}`); sendToRVS({