fix: XTTS groups sentences into ~250 char chunks for consistent voice quality

- Sentences are packed greedily into chunks, typically 2-3 sentences each (more context = more stable voice/volume)
- Target maximum of 250 chars per chunk (keeps WebSocket packets manageable); a single sentence longer than that is still sent as one chunk
- Periods are re-inserted between sentences within a chunk (natural pauses)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-10 02:23:29 +02:00
parent 0428c06612
commit 8929bc99bb
+26 -12
View File
@@ -100,28 +100,42 @@ async function handleTTSRequest(payload) {
// Markdown entfernen // Markdown entfernen
const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim(); const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim();
// Text in Saetze aufteilen (sequentiell rendern fuer korrekte Reihenfolge) // Text in Saetze aufteilen, dann zu Chunks von 2-3 Saetzen zusammenfassen
// (mehr Kontext = konsistentere Stimme/Lautstaerke, aber nicht zu lang fuer WebSocket)
const sentences = cleanText.split(/(?<=[.!?])\s+/) const sentences = cleanText.split(/(?<=[.!?])\s+/)
.map(s => s.trim()) .map(s => s.trim())
.filter(s => s.length > 0) .filter(s => s.length > 0)
.map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen (XTTS liest ihn sonst vor) .map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen
if (sentences.length === 0) return;
log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze, voice: ${voice || "default"}, lang: ${language || "de"})`); const MAX_CHUNK_CHARS = 250; // Max ~250 Zeichen pro Chunk
const chunks = [];
let currentChunk = '';
for (const sentence of sentences) {
if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
chunks.push(currentChunk);
currentChunk = sentence;
} else {
currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence;
}
}
if (currentChunk) chunks.push(currentChunk);
if (chunks.length === 0) return;
log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
try { try {
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
// Jeden Satz sequentiell rendern und sofort senden // Jeden Chunk sequentiell rendern und sofort senden
for (let i = 0; i < sentences.length; i++) { for (let i = 0; i < chunks.length; i++) {
const sentence = sentences[i]; const chunk = chunks[i];
try { try {
const audioBuffer = await callXTTSAPI(sentence, language || "de", hasCustomVoice ? voiceSample : null); const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
if (audioBuffer && audioBuffer.length > 100) { if (audioBuffer && audioBuffer.length > 100) {
const base64 = audioBuffer.toString("base64"); const base64 = audioBuffer.toString("base64");
log(`TTS [${i + 1}/${sentences.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${sentence.slice(0, 40)}..."`); log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${chunk.slice(0, 50)}..."`);
sendToRVS({ sendToRVS({
type: "xtts_response", type: "xtts_response",
@@ -135,12 +149,12 @@ async function handleTTSRequest(payload) {
timestamp: Date.now(), timestamp: Date.now(),
}); });
} }
} catch (sentenceErr) { } catch (chunkErr) {
log(`TTS [${i + 1}/${sentences.length}] Fehler: ${sentenceErr.message} — ueberspringe`); log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
} }
} }
log(`TTS komplett: ${sentences.length} Saetze gerendert`); log(`TTS komplett: ${chunks.length} Chunks gerendert`);
} catch (err) { } catch (err) {
log(`TTS Fehler: ${err.message}`); log(`TTS Fehler: ${err.message}`);
sendToRVS({ sendToRVS({