From 8929bc99bbf39eea1edd058ea13c3348279d988e Mon Sep 17 00:00:00 2001 From: duffyduck Date: Fri, 10 Apr 2026 02:23:29 +0200 Subject: [PATCH] fix: XTTS groups sentences into ~250 char chunks for consistent voice quality - 2-3 sentences per chunk (more context = stable voice/volume) - Max 250 chars per chunk (keeps WebSocket packets manageable) - Dots re-added between sentences within a chunk (natural pauses) Co-Authored-By: Claude Opus 4.6 (1M context) --- xtts/bridge.js | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/xtts/bridge.js b/xtts/bridge.js index 0300474..e1c5464 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -100,28 +100,42 @@ async function handleTTSRequest(payload) { // Markdown entfernen const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim(); - // Text in Saetze aufteilen (sequentiell rendern fuer korrekte Reihenfolge) + // Text in Saetze aufteilen, dann zu Chunks von 2-3 Saetzen zusammenfassen + // (mehr Kontext = konsistentere Stimme/Lautstaerke, aber nicht zu lang fuer WebSocket) const sentences = cleanText.split(/(?<=[.!?])\s+/) .map(s => s.trim()) .filter(s => s.length > 0) - .map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen (XTTS liest ihn sonst vor) - if (sentences.length === 0) return; + .map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen - log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze, voice: ${voice || "default"}, lang: ${language || "de"})`); + const MAX_CHUNK_CHARS = 250; // Max ~250 Zeichen pro Chunk + const chunks = []; + let currentChunk = ''; + for (const sentence of sentences) { + if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) { + chunks.push(currentChunk); + currentChunk = sentence; + } else { + currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence; + } + } + if (currentChunk) chunks.push(currentChunk); + if (chunks.length === 0) return; + + log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`); try { const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); - // Jeden Satz sequentiell rendern und sofort senden - for (let i = 0; i < sentences.length; i++) { - const sentence = sentences[i]; + // Jeden Chunk sequentiell rendern und sofort senden + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; try { - const audioBuffer = await callXTTSAPI(sentence, language || "de", hasCustomVoice ? voiceSample : null); + const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null); if (audioBuffer && audioBuffer.length > 100) { const base64 = audioBuffer.toString("base64"); - log(`TTS [${i + 1}/${sentences.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${sentence.slice(0, 40)}..."`); + log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${chunk.slice(0, 50)}..."`); sendToRVS({ type: "xtts_response", @@ -135,12 +149,12 @@ async function handleTTSRequest(payload) { timestamp: Date.now(), }); } - } catch (sentenceErr) { - log(`TTS [${i + 1}/${sentences.length}] Fehler: ${sentenceErr.message} — ueberspringe`); + } catch (chunkErr) { + log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`); } } - log(`TTS komplett: ${sentences.length} Saetze gerendert`); + log(`TTS komplett: ${chunks.length} Chunks gerendert`); } catch (err) { log(`TTS Fehler: ${err.message}`); sendToRVS({