fix: Streaming TTS — doppeltes Audio + Gaps zwischen Saetzen
Zwei Probleme gefunden:
1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
aria-bridge hat audio_pcm von XTTS-Bridge empfangen und per
_send_to_rvs rebroadcastet. RVS broadcast geht an ALLE Clients
ausser Sender — die App bekam jeden Chunk also zwei mal:
XTTS-Bridge → RVS → App + aria-bridge
aria-bridge → RVS → App (nochmal!) + XTTS-Bridge
Zwei ueberlagerte PCM-Streams klingen wie Doubled/Artefakte.
Fix: aria-bridge ignoriert audio_pcm jetzt. messageId schickt
XTTS-Bridge selbst im Payload (via xtts_request -> messageId).
2) GAPS ZWISCHEN SAETZEN (abgehackt)
xtts/bridge.js teilte Text in ~150-char Chunks und rief pro Chunk
einen eigenen /tts_to_audio/ Request. Zwischen Chunks lag die
XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
Fix: cleanText geht JETZT in einem Request komplett an XTTS.
Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
Kompromiss: Erste Samples kommen spaeter (ganze Text-Render dauert
laenger als der erste Satz alleine), aber dann kontinuierlich
ohne Unterbrechung.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
019c078393
commit
350069d371
|
|
@ -1100,25 +1100,12 @@ class ARIABridge:
|
|||
return
|
||||
|
||||
elif msg_type == "audio_pcm":
|
||||
# XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
|
||||
# Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
|
||||
error = payload.get("error", "")
|
||||
if error:
|
||||
logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
|
||||
return
|
||||
linked_message_id = payload.get("messageId", "")
|
||||
if not linked_message_id:
|
||||
req_id_full = payload.get("requestId", "")
|
||||
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
||||
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
||||
# Einfach 1:1 weiterleiten mit eingefuellter messageId
|
||||
forwarded = dict(payload)
|
||||
forwarded["messageId"] = linked_message_id
|
||||
await self._send_to_rvs({
|
||||
"type": "audio_pcm",
|
||||
"payload": forwarded,
|
||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||
})
|
||||
# Audio-PCM geht direkt von XTTS-Bridge an die App.
|
||||
# Die aria-bridge darf es NICHT rebroadcasten — sonst bekommt die App
|
||||
# jeden Chunk doppelt (einmal direkt von XTTS-Bridge via RVS-Broadcast,
|
||||
# einmal indirekt via uns).
|
||||
# Wir ignorieren diese Message hier einfach — messageId wird von
|
||||
# XTTS-Bridge selbst im Payload mitgeliefert.
|
||||
return
|
||||
|
||||
elif msg_type == "xtts_response":
|
||||
|
|
|
|||
113
xtts/bridge.js
113
xtts/bridge.js
|
|
@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
|
|||
.replace(/\(\)/g, "")
|
||||
.trim();
|
||||
|
||||
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
|
||||
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
||||
.map(s => s.trim())
|
||||
.filter(s => s.length > 0)
|
||||
.map(s => s.replace(/[.]+$/, ''));
|
||||
|
||||
const MAX_CHUNK_CHARS = 150;
|
||||
const chunks = [];
|
||||
let currentChunk = '';
|
||||
for (const sentence of sentences) {
|
||||
if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
|
||||
chunks.push(currentChunk);
|
||||
currentChunk = sentence;
|
||||
} else {
|
||||
currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
|
||||
}
|
||||
}
|
||||
if (currentChunk) chunks.push(currentChunk);
|
||||
if (chunks.length === 0) return;
|
||||
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||
|
||||
try {
|
||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||
|
||||
let chunkIndex = 0;
|
||||
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
|
||||
let pcmMeta = null;
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
const isLastChunk = i === chunks.length - 1;
|
||||
try {
|
||||
// Streaming: PCM-Frames werden nacheinander an RVS gepusht,
|
||||
// sobald sie vom XTTS-Server reinkommen
|
||||
await streamXTTSAsPCM(
|
||||
chunk,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
// EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
|
||||
// XTTS rendert und wir streamen PCM sobald es reinkommt.
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
);
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
|
||||
if (isLastChunk && pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
} catch (chunkErr) {
|
||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
||||
}
|
||||
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
|
||||
if (pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
|
||||
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||
|
|
|
|||
Loading…
Reference in New Issue