fix: Streaming TTS — doppeltes Audio + Gaps zwischen Saetzen
Zwei Probleme gefunden:
1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
aria-bridge hat audio_pcm von der XTTS-Bridge empfangen und per
_send_to_rvs rebroadcastet. Der RVS-Broadcast geht an ALLE Clients
ausser dem Sender — die App bekam jeden Chunk also zweimal:
XTTS-Bridge → RVS → App + aria-bridge
aria-bridge → RVS → App (nochmal!) + XTTS-Bridge
Zwei ueberlagerte PCM-Streams klingen gedoppelt und erzeugen Artefakte.
Fix: aria-bridge ignoriert audio_pcm jetzt. Die messageId schickt die
XTTS-Bridge selbst im Payload mit (via xtts_request -> messageId).
2) GAPS ZWISCHEN SAETZEN (abgehackt)
xtts/bridge.js teilte den Text in Chunks von ~150 Zeichen und schickte pro
Chunk einen eigenen /tts_to_audio/-Request. Zwischen den Chunks lag die
XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
Fix: cleanText geht JETZT in einem Request komplett an XTTS.
Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
Kompromiss: Die ersten Samples kommen spaeter (das Rendern des ganzen
Texts dauert laenger als das des ersten Satzes allein), danach laeuft
das Audio aber kontinuierlich ohne Unterbrechung.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+6
-19
@@ -1100,25 +1100,12 @@ class ARIABridge:
|
|||||||
return
|
return
|
||||||
|
|
||||||
elif msg_type == "audio_pcm":
|
elif msg_type == "audio_pcm":
|
||||||
# XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
|
# Audio-PCM geht direkt von XTTS-Bridge an die App.
|
||||||
# Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
|
# Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App
|
||||||
error = payload.get("error", "")
|
# jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast,
|
||||||
if error:
|
# einmal indirekt via uns).
|
||||||
logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
|
# Wir ignorieren diese Message hier einfach — messageId wird von
|
||||||
return
|
# XTTS-Bridge selbst im Payload mitgeliefert.
|
||||||
linked_message_id = payload.get("messageId", "")
|
|
||||||
if not linked_message_id:
|
|
||||||
req_id_full = payload.get("requestId", "")
|
|
||||||
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
|
||||||
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
|
||||||
# Einfach 1:1 weiterleiten mit eingefuellter messageId
|
|
||||||
forwarded = dict(payload)
|
|
||||||
forwarded["messageId"] = linked_message_id
|
|
||||||
await self._send_to_rvs({
|
|
||||||
"type": "audio_pcm",
|
|
||||||
"payload": forwarded,
|
|
||||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
|
||||||
})
|
|
||||||
return
|
return
|
||||||
|
|
||||||
elif msg_type == "xtts_response":
|
elif msg_type == "xtts_response":
|
||||||
|
|||||||
+42
-71
@@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
|
|||||||
.replace(/\(\)/g, "")
|
.replace(/\(\)/g, "")
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
|
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||||
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
|
||||||
.map(s => s.trim())
|
|
||||||
.filter(s => s.length > 0)
|
|
||||||
.map(s => s.replace(/[.]+$/, ''));
|
|
||||||
|
|
||||||
const MAX_CHUNK_CHARS = 150;
|
|
||||||
const chunks = [];
|
|
||||||
let currentChunk = '';
|
|
||||||
for (const sentence of sentences) {
|
|
||||||
if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
|
|
||||||
chunks.push(currentChunk);
|
|
||||||
currentChunk = sentence;
|
|
||||||
} else {
|
|
||||||
currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (currentChunk) chunks.push(currentChunk);
|
|
||||||
if (chunks.length === 0) return;
|
|
||||||
|
|
||||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||||
|
|
||||||
let chunkIndex = 0;
|
let chunkIndex = 0;
|
||||||
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
|
|
||||||
let pcmMeta = null;
|
let pcmMeta = null;
|
||||||
|
|
||||||
for (let i = 0; i < chunks.length; i++) {
|
// EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
|
||||||
const chunk = chunks[i];
|
// XTTS rendert und wir streamen PCM sobald es reinkommt.
|
||||||
const isLastChunk = i === chunks.length - 1;
|
await streamXTTSAsPCM(
|
||||||
try {
|
cleanText,
|
||||||
// Streaming: PCM-Frames werden nacheinander an RVS gepusht,
|
language || "de",
|
||||||
// sobald sie vom XTTS-Server reinkommen
|
hasCustomVoice ? voiceSample : null,
|
||||||
await streamXTTSAsPCM(
|
(pcmBase64, meta) => {
|
||||||
chunk,
|
if (!pcmMeta) pcmMeta = meta;
|
||||||
language || "de",
|
sendToRVS({
|
||||||
hasCustomVoice ? voiceSample : null,
|
type: "audio_pcm",
|
||||||
(pcmBase64, meta) => {
|
payload: {
|
||||||
if (!pcmMeta) pcmMeta = meta;
|
requestId: requestId || "",
|
||||||
sendToRVS({
|
messageId: messageId || "",
|
||||||
type: "audio_pcm",
|
base64: pcmBase64,
|
||||||
payload: {
|
format: "pcm_s16le",
|
||||||
requestId: requestId || "",
|
sampleRate: meta.sampleRate,
|
||||||
messageId: messageId || "",
|
channels: meta.channels,
|
||||||
base64: pcmBase64,
|
voice: voice || "default",
|
||||||
format: "pcm_s16le",
|
chunk: chunkIndex++,
|
||||||
sampleRate: meta.sampleRate,
|
final: false,
|
||||||
channels: meta.channels,
|
|
||||||
voice: voice || "default",
|
|
||||||
chunk: chunkIndex++,
|
|
||||||
final: false,
|
|
||||||
},
|
|
||||||
timestamp: Date.now(),
|
|
||||||
});
|
|
||||||
},
|
},
|
||||||
);
|
timestamp: Date.now(),
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
|
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
|
||||||
if (isLastChunk && pcmMeta) {
|
if (pcmMeta) {
|
||||||
sendToRVS({
|
sendToRVS({
|
||||||
type: "audio_pcm",
|
type: "audio_pcm",
|
||||||
payload: {
|
payload: {
|
||||||
requestId: requestId || "",
|
requestId: requestId || "",
|
||||||
messageId: messageId || "",
|
messageId: messageId || "",
|
||||||
base64: "",
|
base64: "",
|
||||||
format: "pcm_s16le",
|
format: "pcm_s16le",
|
||||||
sampleRate: pcmMeta.sampleRate,
|
sampleRate: pcmMeta.sampleRate,
|
||||||
channels: pcmMeta.channels,
|
channels: pcmMeta.channels,
|
||||||
voice: voice || "default",
|
voice: voice || "default",
|
||||||
chunk: chunkIndex++,
|
chunk: chunkIndex++,
|
||||||
final: true,
|
final: true,
|
||||||
},
|
},
|
||||||
timestamp: Date.now(),
|
timestamp: Date.now(),
|
||||||
});
|
});
|
||||||
}
|
|
||||||
} catch (chunkErr) {
|
|
||||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||||
|
|||||||
Reference in New Issue
Block a user