fix: XTTS streaming mode - send each chunk immediately, comma between sentences
- Back to streaming: render chunk → send immediately → next chunk
- App plays with a preloading queue (no waiting for all chunks)
- Comma instead of dot between sentences within a chunk (so "Punkt" is not read aloud)
- Sentence-ending dots were already removed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
81f7c38383
commit
f7f450a09d
119
xtts/bridge.js
119
xtts/bridge.js
|
|
@ -115,7 +115,7 @@ async function handleTTSRequest(payload) {
|
||||||
chunks.push(currentChunk);
|
chunks.push(currentChunk);
|
||||||
currentChunk = sentence;
|
currentChunk = sentence;
|
||||||
} else {
|
} else {
|
||||||
currentChunk = currentChunk ? currentChunk + '. ' + sentence : sentence;
|
currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (currentChunk) chunks.push(currentChunk);
|
if (currentChunk) chunks.push(currentChunk);
|
||||||
|
|
@ -127,114 +127,39 @@ async function handleTTSRequest(payload) {
|
||||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||||
|
|
||||||
// Alle Chunks sequentiell rendern und PCM-Daten sammeln
|
// Streaming: Chunk rendern → sofort senden → naechster Chunk
|
||||||
const pcmBuffers = [];
|
// App spielt mit Preloading-Queue nahtlos ab
|
||||||
let sampleRate = 0;
|
let sentCount = 0;
|
||||||
let channels = 0;
|
|
||||||
let bitsPerSample = 0;
|
|
||||||
|
|
||||||
for (let i = 0; i < chunks.length; i++) {
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
const chunk = chunks[i];
|
const chunk = chunks[i];
|
||||||
try {
|
try {
|
||||||
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
|
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
|
||||||
|
|
||||||
if (audioBuffer && audioBuffer.length > 44) {
|
if (audioBuffer && audioBuffer.length > 100) {
|
||||||
// WAV-Header parsen (erste 44 bytes) um PCM-Daten zu extrahieren
|
log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
|
||||||
if (sampleRate === 0) {
|
|
||||||
channels = audioBuffer.readUInt16LE(22);
|
sendToRVS({
|
||||||
sampleRate = audioBuffer.readUInt32LE(24);
|
type: "xtts_response",
|
||||||
bitsPerSample = audioBuffer.readUInt16LE(34);
|
payload: {
|
||||||
}
|
requestId: `${requestId || ""}_${i}`,
|
||||||
// PCM-Daten ab Byte 44
|
base64: audioBuffer.toString("base64"),
|
||||||
pcmBuffers.push(audioBuffer.slice(44));
|
mimeType: "audio/wav",
|
||||||
log(`TTS [${i + 1}/${chunks.length}]: ${audioBuffer.length} bytes — "${chunk.slice(0, 40)}..."`);
|
voice: voice || "default",
|
||||||
|
engine: "xtts",
|
||||||
|
part: i + 1,
|
||||||
|
totalParts: chunks.length,
|
||||||
|
},
|
||||||
|
timestamp: Date.now(),
|
||||||
|
});
|
||||||
|
sentCount++;
|
||||||
}
|
}
|
||||||
} catch (chunkErr) {
|
} catch (chunkErr) {
|
||||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pcmBuffers.length === 0) {
|
log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
|
||||||
log("TTS: Keine Audio-Daten erzeugt");
|
|
||||||
sendToRVS({
|
|
||||||
type: "xtts_response",
|
|
||||||
payload: { requestId, error: "Keine Audio-Daten" },
|
|
||||||
timestamp: Date.now(),
|
|
||||||
});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// PCM-Daten zusammenfuegen und neuen WAV-Header schreiben
|
|
||||||
const allPcm = Buffer.concat(pcmBuffers);
|
|
||||||
const wavHeader = Buffer.alloc(44);
|
|
||||||
const byteRate = sampleRate * channels * (bitsPerSample / 8);
|
|
||||||
const blockAlign = channels * (bitsPerSample / 8);
|
|
||||||
|
|
||||||
wavHeader.write("RIFF", 0);
|
|
||||||
wavHeader.writeUInt32LE(36 + allPcm.length, 4);
|
|
||||||
wavHeader.write("WAVE", 8);
|
|
||||||
wavHeader.write("fmt ", 12);
|
|
||||||
wavHeader.writeUInt32LE(16, 16); // Subchunk1Size
|
|
||||||
wavHeader.writeUInt16LE(1, 20); // PCM format
|
|
||||||
wavHeader.writeUInt16LE(channels, 22);
|
|
||||||
wavHeader.writeUInt32LE(sampleRate, 24);
|
|
||||||
wavHeader.writeUInt32LE(byteRate, 28);
|
|
||||||
wavHeader.writeUInt16LE(blockAlign, 32);
|
|
||||||
wavHeader.writeUInt16LE(bitsPerSample, 34);
|
|
||||||
wavHeader.write("data", 36);
|
|
||||||
wavHeader.writeUInt32LE(allPcm.length, 40);
|
|
||||||
|
|
||||||
const completeWav = Buffer.concat([wavHeader, allPcm]);
|
|
||||||
const base64 = completeWav.toString("base64");
|
|
||||||
|
|
||||||
// In ~8 Sekunden Teile splitten (nahtlos genug fuer Queue, klein genug fuer WebSocket)
|
|
||||||
const samplesPerSec = sampleRate * channels * (bitsPerSample / 8);
|
|
||||||
const TARGET_SECS = 8; // ~8 Sekunden pro Teil
|
|
||||||
const targetBytes = samplesPerSec * TARGET_SECS;
|
|
||||||
|
|
||||||
const pcmParts = [];
|
|
||||||
for (let offset = 0; offset < allPcm.length; offset += targetBytes) {
|
|
||||||
pcmParts.push(allPcm.slice(offset, Math.min(offset + targetBytes, allPcm.length)));
|
|
||||||
}
|
|
||||||
|
|
||||||
function buildWav(pcmData) {
|
|
||||||
const header = Buffer.alloc(44);
|
|
||||||
header.write("RIFF", 0);
|
|
||||||
header.writeUInt32LE(36 + pcmData.length, 4);
|
|
||||||
header.write("WAVE", 8);
|
|
||||||
header.write("fmt ", 12);
|
|
||||||
header.writeUInt32LE(16, 16);
|
|
||||||
header.writeUInt16LE(1, 20);
|
|
||||||
header.writeUInt16LE(channels, 22);
|
|
||||||
header.writeUInt32LE(sampleRate, 24);
|
|
||||||
header.writeUInt32LE(byteRate, 28);
|
|
||||||
header.writeUInt16LE(blockAlign, 32);
|
|
||||||
header.writeUInt16LE(bitsPerSample, 34);
|
|
||||||
header.write("data", 36);
|
|
||||||
header.writeUInt32LE(pcmData.length, 40);
|
|
||||||
return Buffer.concat([header, pcmData]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (let p = 0; p < pcmParts.length; p++) {
|
|
||||||
const partWav = buildWav(pcmParts[p]);
|
|
||||||
|
|
||||||
sendToRVS({
|
|
||||||
type: "xtts_response",
|
|
||||||
payload: {
|
|
||||||
requestId: `${requestId || ""}${pcmParts.length > 1 ? '_' + p : ''}`,
|
|
||||||
base64: partWav.toString("base64"),
|
|
||||||
mimeType: "audio/wav",
|
|
||||||
voice: voice || "default",
|
|
||||||
engine: "xtts",
|
|
||||||
part: p + 1,
|
|
||||||
totalParts: pcmParts.length,
|
|
||||||
},
|
|
||||||
timestamp: Date.now(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
const totalSecs = (allPcm.length / byteRate).toFixed(1);
|
|
||||||
log(`TTS komplett: ${chunks.length} Chunks → ${pcmParts.length} Teil(e), ${(allPcm.length / 1024).toFixed(0)}KB, ${totalSecs}s`);
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
log(`TTS Fehler: ${err.message}`);
|
log(`TTS Fehler: ${err.message}`);
|
||||||
sendToRVS({
|
sendToRVS({
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue