From b3d3b8b6bc353662782bd38dda4a3162ae7978cb Mon Sep 17 00:00:00 2001 From: duffyduck Date: Fri, 10 Apr 2026 02:03:29 +0200 Subject: [PATCH] fix: XTTS bridge splits text into sentences sequentially - XTTS-Bridge does sentence splitting (not ARIA-Bridge) - Sequential rendering: correct order guaranteed - Each sentence sent as separate xtts_response - Markdown removal before splitting - App starts playback after first sentence (faster UX) Co-Authored-By: Claude Opus 4.6 (1M context) --- bridge/aria_bridge.py | 31 +++++++++-------------- xtts/bridge.js | 58 ++++++++++++++++++++++++------------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 789effc..3e1331e 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -851,27 +851,20 @@ class ARIABridge: tts_engine = getattr(self, 'tts_engine_type', 'piper') if tts_engine == "xtts": - # XTTS: Lange Texte satzweise aufteilen (WebSocket-Limit + schnellere Wiedergabe) - import re as _re + # XTTS: Ganzen Text senden, XTTS-Bridge teilt satzweise auf xtts_voice = getattr(self, 'xtts_voice', '') - clean_text = _re.sub(r'\*\*([^*]+)\*\*', r'\1', text).strip() - sentences = _re.split(r'(?<=[.!?])\s+', clean_text) - sentences = [s.strip() for s in sentences if s.strip()] - if not sentences: - sentences = [clean_text] try: - for sentence in sentences: - await self._send_to_rvs({ - "type": "xtts_request", - "payload": { - "text": sentence, - "voice": xtts_voice, - "language": "de", - "requestId": str(uuid.uuid4()), - }, - "timestamp": int(asyncio.get_event_loop().time() * 1000), - }) - logger.info("[core] XTTS-Request gesendet (%s, %d Saetze): '%s'", xtts_voice or "default", len(sentences), text[:60]) + await self._send_to_rvs({ + "type": "xtts_request", + "payload": { + "text": text, + "voice": xtts_voice, + "language": "de", + "requestId": str(uuid.uuid4()), + }, + "timestamp": int(asyncio.get_event_loop().time() * 1000), + }) + logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", text[:60]) except Exception as e: logger.warning("[core] XTTS-Request fehlgeschlagen: %s — Fallback auf Piper", e) # Fallback auf Piper diff --git a/xtts/bridge.js b/xtts/bridge.js index 12d271b..7a1e04d 100644 --- a/xtts/bridge.js +++ b/xtts/bridge.js @@ -97,39 +97,47 @@ async function handleTTSRequest(payload) { const { text, voice, requestId, language } = payload; if (!text) return; - log(`TTS-Request: "${text.slice(0, 60)}..." (voice: ${voice || "default"}, lang: ${language || "de"})`); + // Markdown entfernen + const cleanText = text.replace(/\*\*([^*]+)\*\*/g, "$1").trim(); + + // Text in Saetze aufteilen (sequentiell rendern fuer korrekte Reihenfolge) + const sentences = cleanText.split(/(?<=[.!?])\s+/).map(s => s.trim()).filter(s => s.length > 0); + if (sentences.length === 0) return; + + log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze, voice: ${voice || "default"}, lang: ${language || "de"})`); try { - // Voice-Sample Pfad bestimmen const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); - // XTTS API aufrufen - const audioBuffer = await callXTTSAPI(text, language || "de", hasCustomVoice ? voiceSample : null); + // Jeden Satz sequentiell rendern und sofort senden + for (let i = 0; i < sentences.length; i++) { + const sentence = sentences[i]; + try { + const audioBuffer = await callXTTSAPI(sentence, language || "de", hasCustomVoice ? voiceSample : null); - if (audioBuffer && audioBuffer.length > 100) { - const base64 = audioBuffer.toString("base64"); - log(`TTS fertig: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB)`); + if (audioBuffer && audioBuffer.length > 100) { + const base64 = audioBuffer.toString("base64"); + log(`TTS [${i + 1}/${sentences.length}]: ${audioBuffer.length} bytes (${(audioBuffer.length / 1024).toFixed(0)}KB) — "${sentence.slice(0, 40)}..."`); - sendToRVS({ - type: "xtts_response", - payload: { - requestId: requestId || "", - base64, - mimeType: "audio/wav", - voice: voice || "default", - engine: "xtts", - }, - timestamp: Date.now(), - }); - } else { - log("TTS: Leeres Audio erhalten"); - sendToRVS({ - type: "xtts_response", - payload: { requestId, error: "Leeres Audio" }, - timestamp: Date.now(), - }); + sendToRVS({ + type: "xtts_response", + payload: { + requestId: `${requestId || ""}_${i}`, + base64, + mimeType: "audio/wav", + voice: voice || "default", + engine: "xtts", + }, + timestamp: Date.now(), + }); + } + } catch (sentenceErr) { + log(`TTS [${i + 1}/${sentences.length}] Fehler: ${sentenceErr.message} — ueberspringe`); + } } + + log(`TTS komplett: ${sentences.length} Saetze gerendert`); } catch (err) { log(`TTS Fehler: ${err.message}`); sendToRVS({