From f7f450a09d86098e129f84a49195b0496722d19c Mon Sep 17 00:00:00 2001 From: duffyduck Date: Fri, 10 Apr 2026 02:48:50 +0200 Subject: [PATCH] fix: XTTS streaming mode - send each chunk immediately, comma between sentences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Back to streaming: render chunk → send immediately → next chunk - App plays with preloading queue (no waiting for all chunks) - Comma instead of dot between sentences in chunk (no "Punkt" read aloud) - Sentence-ending dots already removed Co-Authored-By: Claude Opus 4.6 (1M context) --- xtts/bridge.js | 119 +++++++++---------------------------------------- 1 file changed, 22 insertions(+), 97 deletions(-) diff --git a/xtts/bridge.js b/xtts/bridge.js index 5b7d673..84fb20b 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -115,7 +115,7 @@ async function handleTTSRequest(payload) { chunks.push(currentChunk); currentChunk = sentence; } else { - currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence; + currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence; } } if (currentChunk) chunks.push(currentChunk); @@ -127,114 +127,39 @@ async function handleTTSRequest(payload) { const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); - // Alle Chunks sequentiell rendern und PCM-Daten sammeln - const pcmBuffers = []; - let sampleRate = 0; - let channels = 0; - let bitsPerSample = 0; + // Streaming: Chunk rendern → sofort senden → naechster Chunk + // App spielt mit Preloading-Queue nahtlos ab + let sentCount = 0; for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; try { const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null); - if (audioBuffer && audioBuffer.length > 44) { - // WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren - if (sampleRate === 0) { - channels = audioBuffer.readUInt16LE(22); - sampleRate = audioBuffer.readUInt32LE(24); - bitsPerSample = audioBuffer.readUInt16LE(34); - } - // PCM-Daten ab Byte 44 - pcmBuffers.push(audioBuffer.slice(44)); - log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`); + if (audioBuffer && audioBuffer.length > 100) { + log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`); + + sendToRVS({ + type: "xtts_response", + payload: { + requestId: `${requestId || ""}_${i}`, + base64: audioBuffer.toString("base64"), + mimeType: "audio/wav", + voice: voice || "default", + engine: "xtts", + part: i + 1, + totalParts: chunks.length, + }, + timestamp: Date.now(), + }); + sentCount++; } } catch (chunkErr) { log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`); } } - if (pcmBuffers.length === 0) { - log("TTS: Keine Audio-Daten erzeugt"); - sendToRVS({ - type: "xtts_response", - payload: { requestId, error: "Keine Audio-Daten" }, - timestamp: Date.now(), - }); - return; - } - - // PCM-Daten zusammenfuegen und neuen WAV-Header schreiben - const allPcm = Buffer.concat(pcmBuffers); - const wavHeader = Buffer.alloc(44); - const byteRate = sampleRate * channels * (bitsPerSample / 8); - const blockAlign = channels * (bitsPerSample / 8); - - wavHeader.write("RIFF", 0); - wavHeader.writeUInt32LE(36 + allPcm.length, 4); - wavHeader.write("WAVE", 8); - wavHeader.write("fmt ", 12); - wavHeader.writeUInt32LE(16, 16); // Subchunk1Size - wavHeader.writeUInt16LE(1, 20); // PCM format - wavHeader.writeUInt16LE(channels, 22); - wavHeader.writeUInt32LE(sampleRate, 24); - wavHeader.writeUInt32LE(byteRate, 28); - wavHeader.writeUInt16LE(blockAlign, 32); - wavHeader.writeUInt16LE(bitsPerSample, 34); - wavHeader.write("data", 36); - wavHeader.writeUInt32LE(allPcm.length, 40); - - const completeWav = Buffer.concat([wavHeader, allPcm]); - const base64 = completeWav.toString("base64"); - - // In ~8 Sekunden Teile splitten (nahtlos genug fuer Queue, klein genug fuer WebSocket) - const samplesPerSec = sampleRate * channels * (bitsPerSample / 8); - const TARGET_SECS = 8; // ~8 Sekunden pro Teil - const targetBytes = samplesPerSec * TARGET_SECS; - - const pcmParts = []; - for (let offset = 0; offset < allPcm.length; offset += targetBytes) { - pcmParts.push(allPcm.slice(offset, Math.min(offset + targetBytes, allPcm.length))); - } - - function buildWav(pcmData) { - const header = Buffer.alloc(44); - header.write("RIFF", 0); - header.writeUInt32LE(36 + pcmData.length, 4); - header.write("WAVE", 8); - header.write("fmt ", 12); - header.writeUInt32LE(16, 16); - header.writeUInt16LE(1, 20); - header.writeUInt16LE(channels, 22); - header.writeUInt32LE(sampleRate, 24); - header.writeUInt32LE(byteRate, 28); - header.writeUInt16LE(blockAlign, 32); - header.writeUInt16LE(bitsPerSample, 34); - header.write("data", 36); - header.writeUInt32LE(pcmData.length, 40); - return Buffer.concat([header, pcmData]); - } - - for (let p = 0; p < pcmParts.length; p++) { - const partWav = buildWav(pcmParts[p]); - - sendToRVS({ - type: "xtts_response", - payload: { - requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`, - base64: partWav.toString("base64"), - mimeType: "audio/wav", - voice: voice || "default", - engine: "xtts", - part: p + 1, - totalParts: pcmParts.length, - }, - timestamp: Date.now(), - }); - } - - const totalSecs = (allPcm.length / byteRate).toFixed(1); - log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`); + log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`); } catch (err) { log(`TTS Fehler: ${err.message}`); sendToRVS({