fix: Streaming TTS — doppeltes Audio + Gaps zwischen Saetzen

Zwei Probleme gefunden:

1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
   aria-bridge hat audio_pcm von der XTTS-Bridge empfangen und per
   _send_to_rvs rebroadcastet. Der RVS-Broadcast geht an ALLE Clients
   ausser dem Sender — die App bekam jeden Chunk also zweimal:
     XTTS-Bridge → RVS → App + aria-bridge
     aria-bridge → RVS → App (nochmal!) + XTTS-Bridge
   Zwei ueberlagerte PCM-Streams klingen gedoppelt und voller Artefakte.
   Fix: aria-bridge ignoriert audio_pcm jetzt. Die messageId schickt die
   XTTS-Bridge selbst im Payload mit (via xtts_request -> messageId).

2) GAPS ZWISCHEN SAETZEN (abgehackt)
   xtts/bridge.js teilte den Text in Chunks von ~150 Zeichen und setzte
   pro Chunk einen eigenen /tts_to_audio/-Request ab. Zwischen den Chunks
   lag die XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
   Fix: cleanText geht JETZT in einem Request komplett an XTTS.
   Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
   Kompromiss: Die ersten Samples kommen spaeter (das Rendern des ganzen
   Textes dauert laenger als das des ersten Satzes allein), danach laeuft
   das Audio aber kontinuierlich ohne Unterbrechung.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 23:15:57 +02:00
parent 019c078393
commit 350069d371
2 changed files with 48 additions and 90 deletions
+6 -19
View File
@@ -1100,25 +1100,12 @@ class ARIABridge:
return return
elif msg_type == "audio_pcm": elif msg_type == "audio_pcm":
# XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App. # Audio-PCM geht direkt von XTTS-Bridge an die App.
# Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen. # Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App
error = payload.get("error", "") # jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast,
if error: # einmal indirekt via uns).
logger.warning("[rvs] XTTS PCM-Fehler: %s", error) # Wir ignorieren diese Message hier einfach — messageId wird von
return # XTTS-Bridge selbst im Payload mitgeliefert.
linked_message_id = payload.get("messageId", "")
if not linked_message_id:
req_id_full = payload.get("requestId", "")
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
# Einfach 1:1 weiterleiten mit eingefuellter messageId
forwarded = dict(payload)
forwarded["messageId"] = linked_message_id
await self._send_to_rvs({
"type": "audio_pcm",
"payload": forwarded,
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
return return
elif msg_type == "xtts_response": elif msg_type == "xtts_response":
+42 -71
View File
@@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
.replace(/\(\)/g, "") .replace(/\(\)/g, "")
.trim(); .trim();
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren) log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
const sentences = cleanText.split(/(?<=[.!?])\s+/)
.map(s => s.trim())
.filter(s => s.length > 0)
.map(s => s.replace(/[.]+$/, ''));
const MAX_CHUNK_CHARS = 150;
const chunks = [];
let currentChunk = '';
for (const sentence of sentences) {
if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
chunks.push(currentChunk);
currentChunk = sentence;
} else {
currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
}
}
if (currentChunk) chunks.push(currentChunk);
if (chunks.length === 0) return;
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
try { try {
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null; const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample); const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
let chunkIndex = 0; let chunkIndex = 0;
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
let pcmMeta = null; let pcmMeta = null;
for (let i = 0; i < chunks.length; i++) { // EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
const chunk = chunks[i]; // XTTS rendert und wir streamen PCM sobald es reinkommt.
const isLastChunk = i === chunks.length - 1; await streamXTTSAsPCM(
try { cleanText,
// Streaming: PCM-Frames werden nacheinander an RVS gepusht, language || "de",
// sobald sie vom XTTS-Server reinkommen hasCustomVoice ? voiceSample : null,
await streamXTTSAsPCM( (pcmBase64, meta) => {
chunk, if (!pcmMeta) pcmMeta = meta;
language || "de", sendToRVS({
hasCustomVoice ? voiceSample : null, type: "audio_pcm",
(pcmBase64, meta) => { payload: {
if (!pcmMeta) pcmMeta = meta; requestId: requestId || "",
sendToRVS({ messageId: messageId || "",
type: "audio_pcm", base64: pcmBase64,
payload: { format: "pcm_s16le",
requestId: requestId || "", sampleRate: meta.sampleRate,
messageId: messageId || "", channels: meta.channels,
base64: pcmBase64, voice: voice || "default",
format: "pcm_s16le", chunk: chunkIndex++,
sampleRate: meta.sampleRate, final: false,
channels: meta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: false,
},
timestamp: Date.now(),
});
}, },
); timestamp: Date.now(),
});
},
);
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig" // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
if (isLastChunk && pcmMeta) { if (pcmMeta) {
sendToRVS({ sendToRVS({
type: "audio_pcm", type: "audio_pcm",
payload: { payload: {
requestId: requestId || "", requestId: requestId || "",
messageId: messageId || "", messageId: messageId || "",
base64: "", base64: "",
format: "pcm_s16le", format: "pcm_s16le",
sampleRate: pcmMeta.sampleRate, sampleRate: pcmMeta.sampleRate,
channels: pcmMeta.channels, channels: pcmMeta.channels,
voice: voice || "default", voice: voice || "default",
chunk: chunkIndex++, chunk: chunkIndex++,
final: true, final: true,
}, },
timestamp: Date.now(), timestamp: Date.now(),
}); });
}
} catch (chunkErr) {
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
}
} }
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`); log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);