fix: Streaming TTS — doppeltes Audio + Gaps zwischen Saetzen
Zwei Probleme gefunden:
1) DOPPELTES AUDIO (Kern-Ursache der Artefakte)
aria-bridge hat audio_pcm von XTTS-Bridge empfangen und per
_send_to_rvs rebroadcastet. RVS broadcast geht an ALLE Clients
ausser Sender — die App bekam jeden Chunk also zwei mal:
XTTS-Bridge → RVS → App + aria-bridge
aria-bridge → RVS → App (nochmal!) + XTTS-Bridge
Zwei ueberlagerte PCM-Streams klingen wie Doubled/Artefakte.
Fix: aria-bridge ignoriert audio_pcm jetzt. messageId schickt
XTTS-Bridge selbst im Payload (via xtts_request -> messageId).
2) GAPS ZWISCHEN SAETZEN (abgehackt)
xtts/bridge.js teilte Text in ~150-char Chunks und rief pro Chunk
einen eigenen /tts_to_audio/ Request. Zwischen Chunks lag die
XTTS-Render-Zeit (1-3s) → hoerbare Pausen.
Fix: cleanText geht JETZT in einem Request komplett an XTTS.
Ein zusammenhaengender Stream → keine Satz-Gaps mehr.
Kompromiss: Erste Samples kommen spaeter (ganze Text-Render dauert
laenger als der erste Satz alleine), aber dann kontinuierlich
ohne Unterbrechung.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+42
-71
@@ -116,87 +116,58 @@ async function handleTTSRequest(payload) {
|
||||
.replace(/\(\)/g, "")
|
||||
.trim();
|
||||
|
||||
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
|
||||
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
||||
.map(s => s.trim())
|
||||
.filter(s => s.length > 0)
|
||||
.map(s => s.replace(/[.]+$/, ''));
|
||||
|
||||
const MAX_CHUNK_CHARS = 150;
|
||||
const chunks = [];
|
||||
let currentChunk = '';
|
||||
for (const sentence of sentences) {
|
||||
if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
|
||||
chunks.push(currentChunk);
|
||||
currentChunk = sentence;
|
||||
} else {
|
||||
currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
|
||||
}
|
||||
}
|
||||
if (currentChunk) chunks.push(currentChunk);
|
||||
if (chunks.length === 0) return;
|
||||
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||
|
||||
try {
|
||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||
|
||||
let chunkIndex = 0;
|
||||
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
|
||||
let pcmMeta = null;
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
const isLastChunk = i === chunks.length - 1;
|
||||
try {
|
||||
// Streaming: PCM-Frames werden nacheinander an RVS gepusht,
|
||||
// sobald sie vom XTTS-Server reinkommen
|
||||
await streamXTTSAsPCM(
|
||||
chunk,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
// EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
|
||||
// XTTS rendert und wir streamen PCM sobald es reinkommt.
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
);
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
|
||||
if (isLastChunk && pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
} catch (chunkErr) {
|
||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
||||
}
|
||||
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
|
||||
if (pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
|
||||
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||
|
||||
Reference in New Issue
Block a user