fix: XTTS-Endpoint mit Fallback-Chain + Diagnose-Logs

Problem: /tts_stream hat bei User nicht funktioniert → keine
Sprachausgabe mehr. Server hatte vorher 405 fuer POST geantwortet,
meine Umstellung auf GET scheint aber einen anderen Fehler zu
produzieren der nicht geloggt wurde.

Fix:
- streamXTTSAsPCM() = /tts_stream (GET, Streaming) mit ausfuehrlichem
  Error-Logging bei non-200 Response
- streamXTTSBatch() = /tts_to_audio/ (POST, Batch) als Fallback
- handleTTSRequest versucht Stream zuerst, bei Exception Fallback
  auf Batch — so gibt's IMMER Audio, auch wenn /tts_stream kaputt ist
- Log zeigt welcher Pfad benutzt wurde

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
duffyduck 2026-04-22 15:53:10 +02:00
parent 9b5a35cb4a
commit c62ceafdc2
1 changed files with 128 additions and 40 deletions

View File

@ -138,31 +138,43 @@ async function _runTTSRequest(payload) {
let chunkIndex = 0;
let pcmMeta = null;
// EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
// XTTS rendert und wir streamen PCM sobald es reinkommt.
await streamXTTSAsPCM(
cleanText,
language || "de",
hasCustomVoice ? voiceSample : null,
(pcmBase64, meta) => {
if (!pcmMeta) pcmMeta = meta;
sendToRVS({
type: "audio_pcm",
payload: {
requestId: requestId || "",
messageId: messageId || "",
base64: pcmBase64,
format: "pcm_s16le",
sampleRate: meta.sampleRate,
channels: meta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: false,
},
timestamp: Date.now(),
});
},
);
const onChunk = (pcmBase64, meta) => {
if (!pcmMeta) pcmMeta = meta;
sendToRVS({
type: "audio_pcm",
payload: {
requestId: requestId || "",
messageId: messageId || "",
base64: pcmBase64,
format: "pcm_s16le",
sampleRate: meta.sampleRate,
channels: meta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: false,
},
timestamp: Date.now(),
});
};
// Erst /tts_stream (GET) versuchen — echter Streaming, schnell.
// Bei Fehler Fallback auf /tts_to_audio/ (POST) damit Audio trotzdem kommt.
try {
await streamXTTSAsPCM(
cleanText,
language || "de",
hasCustomVoice ? voiceSample : null,
onChunk,
);
} catch (streamErr) {
log(`/tts_stream fehlgeschlagen (${streamErr.message}) — Fallback auf /tts_to_audio/`);
await streamXTTSBatch(
cleanText,
language || "de",
hasCustomVoice ? voiceSample : null,
onChunk,
);
}
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
if (pcmMeta) {
@ -195,37 +207,42 @@ async function _runTTSRequest(payload) {
}
/**
* Ruft /tts_stream (GET) auf echter Streaming-Endpoint bei daswer123.
* Samples fliessen waehrend XTTS rendert (chunked transfer).
* Time-to-first-audio ~300-500ms statt 2-4s beim batch-Endpoint.
*
* Parameter werden als Query-String uebergeben (GET-API).
* Ruft /tts_stream auf echter Streaming-Endpoint bei daswer123.
* Schickt was der Server verlangt (allow: GET), aber mit JSON-Body
* als POST scheitert mit 405. Manche Versionen wollen GET + Query,
* andere POST + JSON. Testen was funktioniert.
*/
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
return new Promise((resolve, reject) => {
const qs = new URLSearchParams({
text,
language: language || "de",
speaker_wav: speakerWav ? speakerWav : "",
stream_chunk_size: "40",
});
const qs = new URLSearchParams();
qs.set("text", text);
qs.set("language", language || "de");
if (speakerWav) qs.set("speaker_wav", speakerWav);
qs.set("stream_chunk_size", "40");
const url = new URL(`${XTTS_API_URL}/tts_stream?${qs.toString()}`);
const url = new URL(XTTS_API_URL);
const fullPath = `/tts_stream?${qs.toString()}`;
const options = {
hostname: url.hostname,
port: url.port,
path: `${url.pathname}?${url.searchParams.toString()}`,
port: url.port || 80,
path: fullPath,
method: "GET",
timeout: 60000,
};
log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`);
const req = http.request(options, (res) => {
if (res.statusCode !== 200) {
let body = "";
res.on("data", (d) => { body += d.toString(); });
res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)));
res.on("end", () => {
log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`);
reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`));
});
return;
}
log(`TTS stream verbunden, empfange PCM...`);
let headerParsed = false;
let sampleRate = 24000;
@ -281,6 +298,77 @@ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
});
}
/**
* Fallback: /tts_to_audio/ (POST JSON) rendert komplett, dann response.
* Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht.
* Shared chunking-Logik mit streamXTTSAsPCM parst WAV-Header, stueckelt PCM.
*/
function streamXTTSBatch(text, language, speakerWav, onPcmChunk) {
return new Promise((resolve, reject) => {
const body = JSON.stringify({
text,
language: language || "de",
speaker_wav: speakerWav || "",
});
const url = new URL(XTTS_API_URL);
const options = {
hostname: url.hostname,
port: url.port || 80,
path: "/tts_to_audio/",
method: "POST",
headers: {
"Content-Type": "application/json",
"Content-Length": Buffer.byteLength(body),
},
timeout: 60000,
};
const req = http.request(options, (res) => {
if (res.statusCode !== 200) {
let rb = "";
res.on("data", (d) => { rb += d.toString(); });
res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`)));
return;
}
let headerParsed = false;
let sampleRate = 24000;
let channels = 1;
let leftover = Buffer.alloc(0);
let headerBuf = Buffer.alloc(0);
const HEADER_BYTES = 44;
const PCM_CHUNK_BYTES = 8192;
res.on("data", (chunk) => {
let data = chunk;
if (!headerParsed) {
headerBuf = Buffer.concat([headerBuf, data]);
if (headerBuf.length < HEADER_BYTES) return;
const header = headerBuf.slice(0, HEADER_BYTES);
try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {}
headerParsed = true;
data = headerBuf.slice(HEADER_BYTES);
}
let combined = Buffer.concat([leftover, data]);
while (combined.length >= PCM_CHUNK_BYTES) {
const slice = combined.slice(0, PCM_CHUNK_BYTES);
combined = combined.slice(PCM_CHUNK_BYTES);
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
}
leftover = combined;
});
res.on("end", () => {
if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
resolve();
});
res.on("error", reject);
});
req.on("error", reject);
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); });
req.write(body);
req.end();
});
}
// ── Voice Upload Handler ────────────────────────────
async function handleVoiceUpload(payload) {