feat(phase2): XTTS durch F5-TTS ersetzt — Voice Cloning auf der Gamebox
Neuer aria-f5tts-bridge Container:
- Python-Service, laedt F5TTS_v1_Base beim Start
- Empfaengt xtts_request via RVS, synthetisiert mit Voice-Cloning,
streamt PCM-Chunks (audio_pcm, 16-bit s16le) wie zuvor die XTTS-Bridge
- Teilt lange Texte an Satzgrenzen, streamt satzweise
- Fade-In auf erstem Chunk, Queue gegen parallel-Render
Voice-Management:
- Speicherort weiterhin /voices/, aber jetzt als Paar
{name}.wav + {name}.txt (F5-TTS braucht Referenz-Transkription)
- voice_upload: WAV speichern, intern stt_request an whisper-bridge
senden, Transkription als .txt ablegen → user muss nichts eintippen
- On-the-fly Transkribierung: wenn eine WAV ohne .txt liegt, wird
bei erstem Render/Preload nachgezogen
- Bestehende RVS-Messages (voice_upload/xtts_list_voices/... etc.)
bleiben unveraendert → keine App/Diagnostic-Aenderung noetig
Gaming-PC docker-compose:
- xtts + xtts-bridge Services entfernt
- f5tts-bridge + whisper-bridge bleiben/kommen rein
- Volume xtts-models → f5tts-models
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e170991222
commit
576ae925dd
|
|
@ -1,5 +0,0 @@
|
|||
FROM node:22-alpine
|
||||
WORKDIR /app
|
||||
COPY bridge.js package.json ./
|
||||
RUN npm install --production
|
||||
CMD ["node", "bridge.js"]
|
||||
596
xtts/bridge.js
596
xtts/bridge.js
|
|
@ -1,596 +0,0 @@
|
|||
/**
|
||||
* ARIA XTTS Bridge — Verbindet XTTS v2 Server mit dem RVS
|
||||
*
|
||||
* Empfaengt tts_request ueber RVS → rendert Audio via XTTS API → sendet zurueck
|
||||
* Empfaengt voice_upload → speichert Voice-Sample fuer Cloning
|
||||
* Empfaengt xtts_list_voices → listet verfuegbare Stimmen
|
||||
*/
|
||||
|
||||
const WebSocket = require("ws");
|
||||
const http = require("http");
|
||||
const https = require("https");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
||||
const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000";
|
||||
const RVS_HOST = process.env.RVS_HOST || "";
|
||||
const RVS_PORT = process.env.RVS_PORT || "443";
|
||||
const RVS_TLS = process.env.RVS_TLS || "true";
|
||||
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
|
||||
const RVS_TOKEN = process.env.RVS_TOKEN || "";
|
||||
const VOICES_DIR = "/voices";
|
||||
|
||||
function log(msg) {
|
||||
console.log(`[${new Date().toISOString()}] ${msg}`);
|
||||
}
|
||||
|
||||
// ── RVS Verbindung ──────────────────────────────────
|
||||
|
||||
let rvsWs = null;
|
||||
let retryDelay = 2;
|
||||
|
||||
function connectRVS(forcePlain) {
|
||||
if (!RVS_HOST || !RVS_TOKEN) {
|
||||
log("RVS nicht konfiguriert — beende");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const useTls = RVS_TLS === "true" && !forcePlain;
|
||||
const proto = useTls ? "wss" : "ws";
|
||||
const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;
|
||||
|
||||
log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);
|
||||
|
||||
const ws = new WebSocket(url);
|
||||
|
||||
ws.on("open", () => {
|
||||
log("RVS verbunden — warte auf TTS-Requests");
|
||||
rvsWs = ws;
|
||||
retryDelay = 2;
|
||||
|
||||
// Keepalive
|
||||
setInterval(() => {
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
ws.ping();
|
||||
ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
|
||||
}
|
||||
}, 25000);
|
||||
});
|
||||
|
||||
ws.on("message", async (raw) => {
|
||||
try {
|
||||
const msg = JSON.parse(raw.toString());
|
||||
|
||||
if (msg.type === "xtts_request") {
|
||||
await handleTTSRequest(msg.payload);
|
||||
} else if (msg.type === "voice_upload") {
|
||||
await handleVoiceUpload(msg.payload);
|
||||
} else if (msg.type === "xtts_list_voices") {
|
||||
await handleListVoices();
|
||||
} else if (msg.type === "xtts_delete_voice") {
|
||||
await handleDeleteVoice(msg.payload);
|
||||
} else if (msg.type === "voice_preload") {
|
||||
await handleVoicePreload(msg.payload);
|
||||
} else if (msg.type === "config") {
|
||||
// Diagnostic hat globale Voice gewechselt → Preload damit der naechste
|
||||
// Render ohne Ladewartezeit startet + alle Clients "voice_ready" sehen
|
||||
const v = msg.payload && msg.payload.xttsVoice;
|
||||
if (v && v !== lastDiagnosticVoice) {
|
||||
lastDiagnosticVoice = v;
|
||||
await handleVoicePreload({ voice: v, source: "diagnostic" });
|
||||
} else if (!v) {
|
||||
lastDiagnosticVoice = "";
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
log(`Fehler: ${err.message}`);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on("close", () => {
|
||||
log("RVS Verbindung geschlossen");
|
||||
rvsWs = null;
|
||||
setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
|
||||
retryDelay = Math.min(retryDelay * 2, 30);
|
||||
});
|
||||
|
||||
ws.on("error", (err) => {
|
||||
log(`RVS Fehler: ${err.message}`);
|
||||
if (useTls && RVS_TLS_FALLBACK === "true") {
|
||||
log("TLS fehlgeschlagen — Fallback auf ws://");
|
||||
ws.removeAllListeners();
|
||||
try { ws.close(); } catch (_) {}
|
||||
connectRVS(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// ── TTS Request Handler ─────────────────────────────
|
||||
|
||||
/**
|
||||
* Linearer Fade-In auf einen base64-PCM-Chunk (s16le).
|
||||
* Mascht XTTS-Warmup-Glitches am Anfang eines Renders.
|
||||
*/
|
||||
function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) {
|
||||
const buf = Buffer.from(base64Pcm, "base64");
|
||||
const totalSamples = buf.length / 2; // s16le
|
||||
const fadeSamples = Math.min(
|
||||
Math.floor((fadeMs / 1000) * sampleRate) * channels,
|
||||
totalSamples
|
||||
);
|
||||
for (let i = 0; i < fadeSamples; i++) {
|
||||
const sample = buf.readInt16LE(i * 2);
|
||||
const gain = i / fadeSamples;
|
||||
buf.writeInt16LE(Math.round(sample * gain), i * 2);
|
||||
}
|
||||
return buf.toString("base64");
|
||||
}
|
||||
|
||||
// ── TTS-Queue ──────────────────────────────────────
|
||||
// XTTS verarbeitet Requests sequenziell, damit Streams sich nicht ueberlappen.
|
||||
// Ohne Queue wuerden parallele Requests parallel streamen → App bekommt
|
||||
// interleaved PCM-Chunks aus zwei Rendern → klingt wie Chaos.
|
||||
let ttsQueue = Promise.resolve();
|
||||
|
||||
// Merkt sich die letzte in Diagnostic gewaehlte Voice, damit wir nicht bei jedem
|
||||
// config-Broadcast (auch ohne Aenderung) einen Preload triggern.
|
||||
let lastDiagnosticVoice = "";
|
||||
|
||||
function handleTTSRequest(payload) {
|
||||
ttsQueue = ttsQueue.then(() => _runTTSRequest(payload)).catch(err => {
|
||||
log(`TTS-Queue Fehler: ${err.message}`);
|
||||
});
|
||||
return ttsQueue;
|
||||
}
|
||||
|
||||
async function _runTTSRequest(payload) {
|
||||
const { text, voice, requestId, language, messageId } = payload;
|
||||
if (!text) return;
|
||||
|
||||
// Markdown-Cleanup (Bridge macht jetzt auch Cleanup, aber safety net)
|
||||
let cleanText = text
|
||||
.replace(/\*\*([^*]+)\*\*/g, "$1")
|
||||
.replace(/\*([^*]+)\*/g, "$1")
|
||||
.replace(/`([^`]+)`/g, "$1")
|
||||
.replace(/```[\s\S]*?```/g, "")
|
||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||
.replace(/#{1,6}\s*/g, "")
|
||||
.replace(/>\s*/g, "")
|
||||
.replace(/[-*]\s+/g, "")
|
||||
.replace(/\n{2,}/g, ". ")
|
||||
.replace(/\n/g, ", ")
|
||||
.replace(/\s{2,}/g, " ")
|
||||
.replace(/["""„]/g, "")
|
||||
.replace(/\(\)/g, "")
|
||||
.trim();
|
||||
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||
|
||||
try {
|
||||
// Im local-Mode erwartet daswer123 XTTS speaker_wav als Basename (ohne .wav,
|
||||
// ohne Pfad) — der Server prefixt EXAMPLE_FOLDER selbst. Wir checken hier
|
||||
// nur das physische File ab um Warnungen zu loggen; runter ans API geht
|
||||
// nur der Name.
|
||||
const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath);
|
||||
const speakerName = hasCustomVoice ? voice : "";
|
||||
if (voice && !hasCustomVoice) {
|
||||
log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`);
|
||||
} else if (hasCustomVoice) {
|
||||
log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`);
|
||||
}
|
||||
|
||||
let chunkIndex = 0;
|
||||
let pcmMeta = null;
|
||||
let firstChunkSeen = false;
|
||||
|
||||
const onChunk = (pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
let outBase64 = pcmBase64;
|
||||
// Fade-In auf den ersten Chunk — maskiert XTTS-Warmup-Glitches
|
||||
// (autoregressiver Generator hat am Anfang wenig Kontext → Artefakte).
|
||||
if (!firstChunkSeen && pcmBase64) {
|
||||
firstChunkSeen = true;
|
||||
outBase64 = applyFadeIn(pcmBase64, meta.sampleRate, meta.channels, 120);
|
||||
}
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: outBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
};
|
||||
|
||||
// /tts_stream fuer echtes Streaming (funktioniert im XTTS local-Mode).
|
||||
// Wenn Server im apiManual/api-Mode laeuft: 400 → Fallback auf /tts_to_audio/.
|
||||
try {
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
speakerName,
|
||||
onChunk,
|
||||
);
|
||||
} catch (streamErr) {
|
||||
log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`);
|
||||
await streamXTTSBatch(
|
||||
cleanText,
|
||||
language || "de",
|
||||
speakerName,
|
||||
onChunk,
|
||||
);
|
||||
}
|
||||
|
||||
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
|
||||
if (pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
|
||||
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||
} catch (err) {
|
||||
log(`TTS Fehler: ${err.message}`);
|
||||
sendToRVS({
|
||||
type: "xtts_response",
|
||||
payload: { requestId, error: err.message },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ruft /tts_stream auf — echter Streaming-Endpoint bei daswer123.
|
||||
* Schickt was der Server verlangt (allow: GET), aber mit JSON-Body
|
||||
* als POST scheitert mit 405. Manche Versionen wollen GET + Query,
|
||||
* andere POST + JSON. Testen was funktioniert.
|
||||
*/
|
||||
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
|
||||
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
|
||||
// stream_chunk_size=250: grosse Chunks = wenige Chunk-Grenzen = wenig
|
||||
// Render-Artefakte. daswer123 erzeugt an Chunk-Boundaries haeufig Glitches
|
||||
// in den Worten die ueber die Grenze gehen. Hoehere Latenz ist OK.
|
||||
const qs = new URLSearchParams();
|
||||
qs.set("text", text);
|
||||
qs.set("language", language || "de");
|
||||
qs.set("speaker_wav", speakerWav || "");
|
||||
qs.set("stream_chunk_size", "250");
|
||||
|
||||
const url = new URL(XTTS_API_URL);
|
||||
const fullPath = `/tts_stream?${qs.toString()}`;
|
||||
const options = {
|
||||
hostname: url.hostname,
|
||||
port: url.port || 80,
|
||||
path: fullPath,
|
||||
method: "GET",
|
||||
timeout: 60000,
|
||||
};
|
||||
|
||||
log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`);
|
||||
|
||||
const req = http.request(options, (res) => {
|
||||
if (res.statusCode !== 200) {
|
||||
let body = "";
|
||||
res.on("data", (d) => { body += d.toString(); });
|
||||
res.on("end", () => {
|
||||
log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`);
|
||||
reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`));
|
||||
});
|
||||
return;
|
||||
}
|
||||
log(`TTS stream verbunden, empfange PCM...`);
|
||||
|
||||
let headerParsed = false;
|
||||
let sampleRate = 24000;
|
||||
let channels = 1;
|
||||
let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk
|
||||
const HEADER_BYTES = 44;
|
||||
let headerBuf = Buffer.alloc(0);
|
||||
const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono
|
||||
|
||||
res.on("data", (chunk) => {
|
||||
let data = chunk;
|
||||
|
||||
// WAV-Header konsumieren (44 Bytes)
|
||||
if (!headerParsed) {
|
||||
headerBuf = Buffer.concat([headerBuf, data]);
|
||||
if (headerBuf.length < HEADER_BYTES) return;
|
||||
// Header lesen
|
||||
const header = headerBuf.slice(0, HEADER_BYTES);
|
||||
try {
|
||||
channels = header.readUInt16LE(22);
|
||||
sampleRate = header.readUInt32LE(24);
|
||||
} catch (_) {}
|
||||
headerParsed = true;
|
||||
data = headerBuf.slice(HEADER_BYTES);
|
||||
}
|
||||
|
||||
// leftover aus vorherigem Chunk + neuer data
|
||||
let combined = Buffer.concat([leftover, data]);
|
||||
|
||||
// In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits)
|
||||
while (combined.length >= PCM_CHUNK_BYTES) {
|
||||
const slice = combined.slice(0, PCM_CHUNK_BYTES);
|
||||
combined = combined.slice(PCM_CHUNK_BYTES);
|
||||
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
leftover = combined;
|
||||
});
|
||||
|
||||
res.on("end", () => {
|
||||
// Rest-Daten senden
|
||||
if (leftover.length > 0) {
|
||||
onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
resolve();
|
||||
});
|
||||
|
||||
res.on("error", reject);
|
||||
});
|
||||
|
||||
req.on("error", reject);
|
||||
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS API Timeout (60s)")); });
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback: /tts_to_audio/ (POST JSON) — rendert komplett, dann response.
|
||||
* Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht.
|
||||
* Shared chunking-Logik mit streamXTTSAsPCM — parst WAV-Header, stueckelt PCM.
|
||||
*/
|
||||
function streamXTTSBatch(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const body = JSON.stringify({
|
||||
text,
|
||||
language: language || "de",
|
||||
speaker_wav: speakerWav || "",
|
||||
});
|
||||
const url = new URL(XTTS_API_URL);
|
||||
const options = {
|
||||
hostname: url.hostname,
|
||||
port: url.port || 80,
|
||||
path: "/tts_to_audio/",
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"Content-Length": Buffer.byteLength(body),
|
||||
},
|
||||
timeout: 60000,
|
||||
};
|
||||
|
||||
const req = http.request(options, (res) => {
|
||||
if (res.statusCode !== 200) {
|
||||
let rb = "";
|
||||
res.on("data", (d) => { rb += d.toString(); });
|
||||
res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`)));
|
||||
return;
|
||||
}
|
||||
let headerParsed = false;
|
||||
let sampleRate = 24000;
|
||||
let channels = 1;
|
||||
let leftover = Buffer.alloc(0);
|
||||
let headerBuf = Buffer.alloc(0);
|
||||
const HEADER_BYTES = 44;
|
||||
const PCM_CHUNK_BYTES = 8192;
|
||||
|
||||
res.on("data", (chunk) => {
|
||||
let data = chunk;
|
||||
if (!headerParsed) {
|
||||
headerBuf = Buffer.concat([headerBuf, data]);
|
||||
if (headerBuf.length < HEADER_BYTES) return;
|
||||
const header = headerBuf.slice(0, HEADER_BYTES);
|
||||
try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {}
|
||||
headerParsed = true;
|
||||
data = headerBuf.slice(HEADER_BYTES);
|
||||
}
|
||||
let combined = Buffer.concat([leftover, data]);
|
||||
while (combined.length >= PCM_CHUNK_BYTES) {
|
||||
const slice = combined.slice(0, PCM_CHUNK_BYTES);
|
||||
combined = combined.slice(PCM_CHUNK_BYTES);
|
||||
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
leftover = combined;
|
||||
});
|
||||
res.on("end", () => {
|
||||
if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
|
||||
resolve();
|
||||
});
|
||||
res.on("error", reject);
|
||||
});
|
||||
req.on("error", reject);
|
||||
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); });
|
||||
req.write(body);
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
// ── Voice Upload Handler ────────────────────────────
|
||||
|
||||
async function handleVoiceUpload(payload) {
|
||||
const { name, samples } = payload;
|
||||
if (!name || !samples || !Array.isArray(samples) || samples.length === 0) {
|
||||
log("Voice Upload: Ungueltige Daten");
|
||||
return;
|
||||
}
|
||||
|
||||
log(`Voice Upload: "${name}" (${samples.length} Samples)`);
|
||||
|
||||
try {
|
||||
// Alle Samples zusammenfuegen
|
||||
const buffers = samples.map(s => Buffer.from(s.base64, "base64"));
|
||||
const combined = Buffer.concat(buffers);
|
||||
|
||||
// Als WAV speichern
|
||||
fs.mkdirSync(VOICES_DIR, { recursive: true });
|
||||
const filePath = path.join(VOICES_DIR, `${name.replace(/[^a-zA-Z0-9_-]/g, "_")}.wav`);
|
||||
fs.writeFileSync(filePath, combined);
|
||||
|
||||
log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);
|
||||
|
||||
sendToRVS({
|
||||
type: "xtts_voice_saved",
|
||||
payload: { name, size: combined.length, path: filePath },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
} catch (err) {
|
||||
log(`Voice Upload Fehler: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Voice Delete Handler ────────────────────────────
|
||||
|
||||
async function handleDeleteVoice(payload) {
|
||||
const { name } = payload || {};
|
||||
if (!name || typeof name !== "string") {
|
||||
log("Voice Delete: ungueltiger Name");
|
||||
return;
|
||||
}
|
||||
const safe = name.replace(/[^a-zA-Z0-9_-]/g, "_");
|
||||
const filePath = path.join(VOICES_DIR, `${safe}.wav`);
|
||||
try {
|
||||
if (fs.existsSync(filePath)) {
|
||||
fs.unlinkSync(filePath);
|
||||
log(`Voice geloescht: ${filePath}`);
|
||||
} else {
|
||||
log(`Voice Delete: Datei existiert nicht (${filePath})`);
|
||||
}
|
||||
// Aktualisierte Liste an alle Clients senden
|
||||
await handleListVoices();
|
||||
} catch (err) {
|
||||
log(`Voice Delete Fehler: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Voice List Handler ──────────────────────────────
|
||||
|
||||
/**
|
||||
* Preload einer Stimme — rendert stumm ein kurzes Dummy-Audio, damit XTTS
|
||||
* die Speaker-Latents laedt und der naechste echte Request ohne Wartezeit
|
||||
* loslegen kann. Broadcastet "voice_ready" wenn fertig (oder mit error).
|
||||
*/
|
||||
async function handleVoicePreload(payload) {
|
||||
const voice = (payload && payload.voice) || "";
|
||||
const source = (payload && payload.source) || "unknown";
|
||||
const requestId = (payload && payload.requestId) || "";
|
||||
log(`Voice-Preload angefordert: "${voice}" (source=${source})`);
|
||||
|
||||
try {
|
||||
let speakerName = "";
|
||||
if (voice) {
|
||||
const voiceFilePath = path.join(VOICES_DIR, `${voice}.wav`);
|
||||
if (!fs.existsSync(voiceFilePath)) {
|
||||
sendToRVS({
|
||||
type: "voice_ready",
|
||||
payload: { voice, requestId, error: "voice-file-not-found" },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
log(`Preload abgebrochen: ${voiceFilePath} existiert nicht`);
|
||||
return;
|
||||
}
|
||||
speakerName = voice;
|
||||
}
|
||||
|
||||
// Dummy-Request via Queue — damit sich Preload nicht mit echtem TTS ueberholt.
|
||||
const t0 = Date.now();
|
||||
await new Promise((resolve, reject) => {
|
||||
ttsQueue = ttsQueue.then(async () => {
|
||||
try {
|
||||
await streamXTTSAsPCM("ja.", "de", speakerName, () => {});
|
||||
resolve();
|
||||
} catch (err) {
|
||||
reject(err);
|
||||
}
|
||||
}).catch(reject);
|
||||
});
|
||||
const ms = Date.now() - t0;
|
||||
log(`Voice "${voice || "default"}" geladen in ${ms}ms`);
|
||||
|
||||
sendToRVS({
|
||||
type: "voice_ready",
|
||||
payload: { voice, requestId, loadMs: ms },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
} catch (err) {
|
||||
log(`Voice-Preload Fehler: ${err.message}`);
|
||||
sendToRVS({
|
||||
type: "voice_ready",
|
||||
payload: { voice, requestId, error: err.message.slice(0, 200) },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async function handleListVoices() {
|
||||
try {
|
||||
const files = fs.existsSync(VOICES_DIR)
|
||||
? fs.readdirSync(VOICES_DIR).filter(f => f.endsWith(".wav"))
|
||||
: [];
|
||||
|
||||
const voices = files.map(f => ({
|
||||
name: path.basename(f, ".wav"),
|
||||
file: f,
|
||||
size: fs.statSync(path.join(VOICES_DIR, f)).size,
|
||||
}));
|
||||
|
||||
log(`Stimmen: ${voices.length} verfuegbar`);
|
||||
|
||||
sendToRVS({
|
||||
type: "xtts_voices_list",
|
||||
payload: { voices },
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
} catch (err) {
|
||||
log(`Stimmen-Liste Fehler: ${err.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ── RVS senden ──────────────────────────────────────
|
||||
|
||||
function sendToRVS(msg) {
|
||||
if (rvsWs && rvsWs.readyState === WebSocket.OPEN) {
|
||||
rvsWs.send(JSON.stringify(msg));
|
||||
}
|
||||
}
|
||||
|
||||
// ── Start ───────────────────────────────────────────
|
||||
|
||||
log("ARIA XTTS Bridge startet...");
|
||||
log(`XTTS API: ${XTTS_API_URL}`);
|
||||
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);
|
||||
|
||||
// Warten bis XTTS API erreichbar ist
|
||||
function waitForXTTS(callback, attempts) {
|
||||
if (attempts <= 0) { log("XTTS API nicht erreichbar — starte trotzdem"); callback(); return; }
|
||||
http.get(`${XTTS_API_URL}/docs`, (res) => {
|
||||
log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
|
||||
callback();
|
||||
}).on("error", () => {
|
||||
log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
|
||||
setTimeout(() => waitForXTTS(callback, attempts - 1), 10000); // 10s statt 5s (Model laden dauert)
|
||||
});
|
||||
}
|
||||
|
||||
waitForXTTS(() => connectRVS(), 30); // Max 5min warten
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# ════════════════════════════════════════════════
|
||||
# ARIA XTTS v2 — GPU TTS Server
|
||||
# ARIA Gamebox Stack — GPU F5-TTS + Whisper STT
|
||||
# Laeuft auf dem Gaming-PC (RTX 3060)
|
||||
# Verbindet sich zum RVS fuer TTS-Requests
|
||||
# Verbindet sich zum RVS fuer TTS/STT-Requests
|
||||
# ════════════════════════════════════════════════
|
||||
#
|
||||
# Voraussetzungen:
|
||||
|
|
@ -10,15 +10,18 @@
|
|||
# - .env mit RVS-Verbindungsdaten
|
||||
#
|
||||
# Start: docker compose up -d
|
||||
# Test: curl http://localhost:8000/docs
|
||||
# ════════════════════════════════════════════════
|
||||
|
||||
services:
|
||||
|
||||
# ─── XTTS v2 API Server (GPU) ─────────────────
|
||||
xtts:
|
||||
image: daswer123/xtts-api-server:latest
|
||||
container_name: aria-xtts
|
||||
# ─── F5-TTS Bridge (GPU) ──────────────────────
|
||||
# Ersetzt den frueheren XTTS-Stack. Empfaengt xtts_request via RVS,
|
||||
# rendert via F5-TTS mit Voice-Cloning, streamt PCM an die App.
|
||||
# Voice-Upload: speichert WAV und laesst whisper-bridge den Referenz-
|
||||
# text transkribieren — der User muss nichts eintippen.
|
||||
f5tts-bridge:
|
||||
build: ./f5tts
|
||||
container_name: aria-f5tts-bridge
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
|
|
@ -26,45 +29,29 @@ services:
|
|||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
ports:
|
||||
- "8000:8020"
|
||||
volumes:
|
||||
- xtts-models:/app/xtts_models # Model-Cache (~2GB)
|
||||
- ./voices:/voices # Custom Voice Samples
|
||||
- ./voices:/voices # WAV + TXT Referenz
|
||||
- f5tts-models:/root/.cache/huggingface # Model-Cache persistieren
|
||||
environment:
|
||||
- COQUI_TOS_AGREED=1
|
||||
# Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM,
|
||||
# Render startet sofort, /tts_stream funktioniert.
|
||||
# Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"}
|
||||
- MODEL_SOURCE=local
|
||||
# Speaker-Folder auf unsere gemounteten voices zeigen lassen
|
||||
- EXAMPLE_FOLDER=/voices
|
||||
restart: unless-stopped
|
||||
|
||||
# ─── XTTS Bridge (verbindet zu RVS) ───────────
|
||||
xtts-bridge:
|
||||
build: .
|
||||
container_name: aria-xtts-bridge
|
||||
depends_on:
|
||||
- xtts
|
||||
volumes:
|
||||
- ./voices:/voices # Shared mit XTTS-Server
|
||||
environment:
|
||||
- XTTS_API_URL=http://xtts:8020
|
||||
- RVS_HOST=${RVS_HOST}
|
||||
- RVS_PORT=${RVS_PORT:-443}
|
||||
- RVS_TLS=${RVS_TLS:-true}
|
||||
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
|
||||
- RVS_TOKEN=${RVS_TOKEN}
|
||||
- F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base}
|
||||
- F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
|
||||
- VOICES_DIR=/voices
|
||||
restart: unless-stopped
|
||||
|
||||
# ─── Whisper STT (GPU) ────────────────────────
|
||||
# Faster-Whisper auf der Gamebox statt auf der VM (CPU) —
|
||||
# deutlich schneller. Verbindet sich selbst per WebSocket an
|
||||
# den RVS und nimmt dort stt_request Nachrichten der aria-bridge
|
||||
# entgegen, antwortet mit stt_response. Laedt das Modell beim
|
||||
# Start vor; auf Config-Broadcasts (Diagnostic → whisperModel)
|
||||
# wird zur Laufzeit hot-swapped.
|
||||
# entgegen, antwortet mit stt_response. Zusaetzlich nutzt die
|
||||
# f5tts-bridge Whisper intern fuer die Referenz-Transkription bei
|
||||
# Voice-Uploads. Laedt das Modell beim Start vor; auf Config-
|
||||
# Broadcasts (Diagnostic → whisperModel) wird zur Laufzeit hot-
|
||||
# swapped.
|
||||
whisper-bridge:
|
||||
build: ./whisper
|
||||
container_name: aria-whisper-bridge
|
||||
|
|
@ -86,9 +73,9 @@ services:
|
|||
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
||||
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
|
||||
volumes:
|
||||
- whisper-models:/root/.cache/huggingface # Model-Cache persistieren
|
||||
- whisper-models:/root/.cache/huggingface
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
xtts-models:
|
||||
f5tts-models:
|
||||
whisper-models:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip ffmpeg git \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# PyTorch CUDA-Wheels zuerst (f5-tts zieht sonst CPU-only Torch rein)
|
||||
RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
|
||||
--index-url https://download.pytorch.org/whl/cu121
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip3 install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY bridge.py .
|
||||
|
||||
CMD ["python3", "bridge.py"]
|
||||
|
|
@ -0,0 +1,580 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
ARIA F5-TTS Bridge — laeuft auf der Gamebox (RTX 3060).
|
||||
|
||||
Empfaengt xtts_request via RVS → F5-TTS Voice Cloning auf GPU → streamt
|
||||
16-bit PCM Chunks als audio_pcm Nachrichten zurueck an die App.
|
||||
|
||||
Voice-Layout im VOICES_DIR:
|
||||
{name}.wav — Referenz-Audio (6-10s, 24kHz mono empfohlen)
|
||||
{name}.txt — Referenz-Text (UTF-8, was im WAV gesprochen wird)
|
||||
|
||||
Beim voice_upload senden wir intern einen stt_request an die whisper-bridge
|
||||
und legen die Transkription als .txt ab — der User muss keinen Text eingeben.
|
||||
|
||||
Env:
|
||||
RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
|
||||
F5TTS_MODEL Default: F5TTS_v1_Base
|
||||
F5TTS_DEVICE Default: cuda
|
||||
VOICES_DIR Default: /voices
|
||||
"""
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import websockets
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger("f5tts-bridge")
|
||||
# HuggingFace + Torch download-Logs etwas daempfen
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
||||
|
||||
RVS_HOST = os.getenv("RVS_HOST", "").strip()
|
||||
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
|
||||
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
|
||||
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
|
||||
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
|
||||
|
||||
F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base")
|
||||
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")
|
||||
VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
|
||||
|
||||
PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16
|
||||
TARGET_SR = 24000 # F5-TTS native
|
||||
|
||||
# ── Lazy F5-TTS Loader ──────────────────────────────────────
|
||||
|
||||
_F5TTS_cls = None
|
||||
|
||||
|
||||
def _get_f5tts_cls():
|
||||
"""Lazy import damit Startup-Logs nicht durch Torch-Warnungen zumuellen."""
|
||||
global _F5TTS_cls
|
||||
if _F5TTS_cls is None:
|
||||
from f5_tts.api import F5TTS as _cls
|
||||
_F5TTS_cls = _cls
|
||||
return _F5TTS_cls
|
||||
|
||||
|
||||
class F5Runner:
|
||||
"""Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking)."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.model = None
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
def _load_blocking(self) -> None:
|
||||
cls = _get_f5tts_cls()
|
||||
logger.info("Lade F5-TTS '%s' (device=%s)...", F5TTS_MODEL, F5TTS_DEVICE)
|
||||
t0 = time.time()
|
||||
self.model = cls(model=F5TTS_MODEL, device=F5TTS_DEVICE)
|
||||
logger.info("F5-TTS geladen in %.1fs", time.time() - t0)
|
||||
|
||||
async def ensure_loaded(self) -> None:
|
||||
async with self._lock:
|
||||
if self.model is not None:
|
||||
return
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, self._load_blocking)
|
||||
|
||||
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
|
||||
wav, sr, _ = self.model.infer(
|
||||
ref_file=ref_wav,
|
||||
ref_text=ref_text,
|
||||
gen_text=gen_text,
|
||||
remove_silence=True,
|
||||
seed=-1,
|
||||
)
|
||||
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
|
||||
if not isinstance(wav, np.ndarray):
|
||||
wav = np.asarray(wav, dtype=np.float32)
|
||||
if wav.ndim > 1:
|
||||
wav = wav.squeeze()
|
||||
return wav.astype(np.float32), int(sr)
|
||||
|
||||
async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
|
||||
await self.ensure_loaded()
|
||||
loop = asyncio.get_event_loop()
|
||||
return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text)
|
||||
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────
|
||||
|
||||
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
|
||||
|
||||
|
||||
def split_sentences(text: str, max_len: int = 350) -> list[str]:
|
||||
"""Teilt langen Text an Satzgrenzen. Kurze Texte bleiben als-is."""
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return []
|
||||
if len(text) <= max_len:
|
||||
return [text]
|
||||
parts = [p.strip() for p in _SENTENCE_SPLIT.split(text) if p.strip()]
|
||||
# Zu kurze Fragmente mergen damit F5-TTS nicht an jedem Komma neu startet
|
||||
merged: list[str] = []
|
||||
buf = ""
|
||||
for p in parts:
|
||||
if len(buf) + len(p) + 1 <= max_len:
|
||||
buf = f"{buf} {p}".strip()
|
||||
else:
|
||||
if buf:
|
||||
merged.append(buf)
|
||||
buf = p
|
||||
if buf:
|
||||
merged.append(buf)
|
||||
return merged or [text]
|
||||
|
||||
|
||||
def float_to_pcm16(wav: np.ndarray) -> bytes:
|
||||
"""Float32 (-1..+1) → int16 little-endian bytes."""
|
||||
wav = np.clip(wav, -1.0, 1.0)
|
||||
pcm = (wav * 32767.0).astype(np.int16)
|
||||
return pcm.tobytes()
|
||||
|
||||
|
||||
def sanitize_voice_name(name: str) -> str:
|
||||
return re.sub(r"[^a-zA-Z0-9_-]", "_", name)
|
||||
|
||||
|
||||
def voice_paths(name: str) -> tuple[Path, Path]:
|
||||
safe = sanitize_voice_name(name)
|
||||
return VOICES_DIR / f"{safe}.wav", VOICES_DIR / f"{safe}.txt"
|
||||
|
||||
|
||||
def ensure_24k_mono_wav(src_wav: Path) -> Path:
|
||||
"""F5-TTS moechte 24kHz mono als Referenz — ffmpeg konvertiert inplace.
|
||||
|
||||
Wenn das File schon passt, wird nichts geaendert. Sonst wird es
|
||||
reingeschrieben (Original wird ueberschrieben).
|
||||
"""
|
||||
try:
|
||||
info = sf.info(str(src_wav))
|
||||
if info.samplerate == TARGET_SR and info.channels == 1:
|
||||
return src_wav
|
||||
except Exception:
|
||||
pass
|
||||
tmp_out = src_wav.with_suffix(".conv.wav")
|
||||
cmd = ["ffmpeg", "-y", "-i", str(src_wav),
|
||||
"-ar", str(TARGET_SR), "-ac", "1", "-f", "wav", str(tmp_out)]
|
||||
r = subprocess.run(cmd, capture_output=True, timeout=30)
|
||||
if r.returncode != 0:
|
||||
logger.warning("ffmpeg-Konvertierung von %s fehlgeschlagen: %s",
|
||||
src_wav, r.stderr.decode(errors="replace")[:200])
|
||||
try:
|
||||
tmp_out.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
return src_wav
|
||||
os.replace(tmp_out, src_wav)
|
||||
return src_wav
|
||||
|
||||
|
||||
async def _send(ws, mtype: str, payload: dict) -> None:
|
||||
try:
|
||||
await ws.send(json.dumps({
|
||||
"type": mtype,
|
||||
"payload": payload,
|
||||
"timestamp": int(time.time() * 1000),
|
||||
}))
|
||||
except Exception as e:
|
||||
logger.warning("Send fehlgeschlagen (%s): %s", mtype, e)
|
||||
|
||||
|
||||
# ── Interne Transkription via whisper-bridge ────────────────
|
||||
|
||||
_pending_stt: dict[str, asyncio.Future] = {}
|
||||
_STT_TIMEOUT_S = 60.0
|
||||
|
||||
|
||||
async def request_transcription(ws, wav_path: Path, language: str = "de") -> Optional[str]:
|
||||
"""Sendet einen stt_request an die whisper-bridge (ueber RVS) und wartet auf stt_response."""
|
||||
try:
|
||||
with open(wav_path, "rb") as f:
|
||||
audio_b64 = base64.b64encode(f.read()).decode("ascii")
|
||||
except Exception as e:
|
||||
logger.error("Lesen %s fehlgeschlagen: %s", wav_path, e)
|
||||
return None
|
||||
|
||||
request_id = str(uuid.uuid4())
|
||||
loop = asyncio.get_event_loop()
|
||||
fut: asyncio.Future = loop.create_future()
|
||||
_pending_stt[request_id] = fut
|
||||
|
||||
try:
|
||||
await _send(ws, "stt_request", {
|
||||
"requestId": request_id,
|
||||
"audio": audio_b64,
|
||||
"mimeType": "audio/wav",
|
||||
"model": "small", # klein reicht fuer Voice-Referenz
|
||||
"language": language,
|
||||
})
|
||||
return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("Transkription Timeout fuer %s", wav_path.name)
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("Transkription Fehler: %s", e)
|
||||
return None
|
||||
finally:
|
||||
_pending_stt.pop(request_id, None)
|
||||
|
||||
|
||||
# ── TTS-Request Handler ─────────────────────────────────────
|
||||
|
||||
# Queue damit sich parallele Requests nicht ueberlappen (GPU-Throughput)
|
||||
_tts_queue: asyncio.Queue[tuple] = asyncio.Queue()
|
||||
|
||||
|
||||
async def _tts_worker(ws, runner: F5Runner) -> None:
|
||||
"""Serialisiert Synthesen — GPU kann sonst OOM gehen."""
|
||||
while True:
|
||||
text, voice, request_id, message_id, language = await _tts_queue.get()
|
||||
try:
|
||||
await _do_tts(ws, runner, text, voice, request_id, message_id, language)
|
||||
except Exception:
|
||||
logger.exception("TTS-Worker Fehler")
|
||||
finally:
|
||||
_tts_queue.task_done()
|
||||
|
||||
|
||||
async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
|
||||
request_id: str, message_id: str, language: str) -> None:
|
||||
t0 = time.time()
|
||||
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
|
||||
has_custom = bool(voice and ref_wav_path and ref_wav_path.exists() and ref_txt_path.exists())
|
||||
if voice and not has_custom:
|
||||
# Wenn nur WAV da ist aber kein txt → on-the-fly transkribieren
|
||||
if ref_wav_path and ref_wav_path.exists() and (not ref_txt_path or not ref_txt_path.exists()):
|
||||
logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
|
||||
text_ref = await request_transcription(ws, ref_wav_path, language)
|
||||
if text_ref:
|
||||
try:
|
||||
ref_txt_path.write_text(text_ref.strip(), encoding="utf-8")
|
||||
has_custom = True
|
||||
logger.info("Referenz-Text nachgezogen: '%s'", text_ref[:60])
|
||||
except Exception as e:
|
||||
logger.warning("Referenz-Text speichern fehlgeschlagen: %s", e)
|
||||
if not has_custom:
|
||||
logger.warning("Voice '%s' nicht komplett (%s, txt=%s) — nehme Default",
|
||||
voice, ref_wav_path, (ref_txt_path and ref_txt_path.exists()))
|
||||
|
||||
if has_custom:
|
||||
ref_wav_str = str(ref_wav_path)
|
||||
ref_text = ref_txt_path.read_text(encoding="utf-8").strip()
|
||||
else:
|
||||
# Fallback: kein Custom-Voice. F5-TTS braucht IMMER eine Referenz,
|
||||
# wir nehmen default_ref.wav/txt falls vorhanden, sonst die erste
|
||||
# gefundene Voice im Ordner.
|
||||
default_wav = VOICES_DIR / "default_ref.wav"
|
||||
default_txt = VOICES_DIR / "default_ref.txt"
|
||||
if default_wav.exists() and default_txt.exists():
|
||||
ref_wav_str = str(default_wav)
|
||||
ref_text = default_txt.read_text(encoding="utf-8").strip()
|
||||
else:
|
||||
# Nimm irgendein vorhandenes voice-Paar
|
||||
pair = next(
|
||||
((w, t) for w, t in (
|
||||
(v, v.with_suffix(".txt")) for v in VOICES_DIR.glob("*.wav")
|
||||
) if t.exists()),
|
||||
None,
|
||||
)
|
||||
if not pair:
|
||||
logger.error("Keine Referenz-Stimme im VOICES_DIR — TTS abgebrochen")
|
||||
return
|
||||
ref_wav_str, ref_text = str(pair[0]), pair[1].read_text(encoding="utf-8").strip()
|
||||
|
||||
sentences = split_sentences(text)
|
||||
logger.info("F5-TTS: %d Satz(e), voice=%s (%s)", len(sentences), voice or "default", ref_wav_str)
|
||||
|
||||
chunk_index = 0
|
||||
pcm_sr = TARGET_SR
|
||||
for i, sent in enumerate(sentences):
|
||||
try:
|
||||
wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
|
||||
pcm_sr = sr
|
||||
pcm_bytes = float_to_pcm16(wav)
|
||||
# Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert
|
||||
# eventuelle Warmup-Glitches). Alle anderen Chunks bleiben wie sind.
|
||||
if i == 0 and chunk_index == 0:
|
||||
pcm_bytes = _fade_in_pcm16(pcm_bytes, sr, 120)
|
||||
|
||||
# Stueckeln
|
||||
for off in range(0, len(pcm_bytes), PCM_CHUNK_BYTES):
|
||||
slice_ = pcm_bytes[off:off + PCM_CHUNK_BYTES]
|
||||
await _send(ws, "audio_pcm", {
|
||||
"requestId": request_id,
|
||||
"messageId": message_id,
|
||||
"base64": base64.b64encode(slice_).decode("ascii"),
|
||||
"format": "pcm_s16le",
|
||||
"sampleRate": sr,
|
||||
"channels": 1,
|
||||
"voice": voice or "default",
|
||||
"chunk": chunk_index,
|
||||
"final": False,
|
||||
})
|
||||
chunk_index += 1
|
||||
except Exception as e:
|
||||
logger.exception("F5-TTS Synthese-Fehler (Satz %d)", i)
|
||||
await _send(ws, "xtts_response", {
|
||||
"requestId": request_id,
|
||||
"error": str(e)[:200],
|
||||
})
|
||||
return
|
||||
|
||||
# Final-Marker
|
||||
await _send(ws, "audio_pcm", {
|
||||
"requestId": request_id,
|
||||
"messageId": message_id,
|
||||
"base64": "",
|
||||
"format": "pcm_s16le",
|
||||
"sampleRate": pcm_sr,
|
||||
"channels": 1,
|
||||
"voice": voice or "default",
|
||||
"chunk": chunk_index,
|
||||
"final": True,
|
||||
})
|
||||
|
||||
logger.info("TTS komplett: %d Chunks, %.2fs render (voice=%s, text=%d chars)",
|
||||
chunk_index, time.time() - t0, voice or "default", len(text))
|
||||
|
||||
|
||||
def _fade_in_pcm16(pcm: bytes, sr: int, fade_ms: int) -> bytes:
|
||||
"""Linear Fade-In auf erste fade_ms — maskiert Warmup-Glitches."""
|
||||
arr = np.frombuffer(pcm, dtype=np.int16).copy()
|
||||
fade_samples = min(int((fade_ms / 1000.0) * sr), len(arr))
|
||||
if fade_samples <= 0:
|
||||
return pcm
|
||||
ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
|
||||
arr[:fade_samples] = (arr[:fade_samples].astype(np.float32) * ramp).astype(np.int16)
|
||||
return arr.tobytes()
|
||||
|
||||
|
||||
# ── Voice Management Handlers ───────────────────────────────
|
||||
|
||||
async def handle_voice_upload(ws, payload: dict) -> None:
|
||||
name = (payload.get("name") or "").strip()
|
||||
samples = payload.get("samples") or []
|
||||
if not name or not samples:
|
||||
logger.warning("voice_upload: ungueltig (name=%r, samples=%d)", name, len(samples))
|
||||
return
|
||||
logger.info("Voice-Upload: '%s' (%d Samples)", name, len(samples))
|
||||
|
||||
try:
|
||||
VOICES_DIR.mkdir(parents=True, exist_ok=True)
|
||||
safe = sanitize_voice_name(name)
|
||||
wav_path = VOICES_DIR / f"{safe}.wav"
|
||||
txt_path = VOICES_DIR / f"{safe}.txt"
|
||||
|
||||
# Samples zusammenfuegen
|
||||
buffers = [base64.b64decode(s.get("base64", "")) for s in samples]
|
||||
with open(wav_path, "wb") as f:
|
||||
for b in buffers:
|
||||
f.write(b)
|
||||
size_kb = wav_path.stat().st_size / 1024
|
||||
logger.info("Voice WAV gespeichert: %s (%.0fKB)", wav_path, size_kb)
|
||||
|
||||
# Auf 24kHz mono normalisieren (falls App in anderem Format liefert)
|
||||
ensure_24k_mono_wav(wav_path)
|
||||
|
||||
# Transkription ueber whisper-bridge anfragen
|
||||
logger.info("Transkribiere '%s' via whisper-bridge...", name)
|
||||
text = await request_transcription(ws, wav_path, language="de")
|
||||
if not text:
|
||||
logger.warning("Transkription fehlgeschlagen — speichere Platzhalter-Text")
|
||||
text = "Das ist ein Referenz Audio."
|
||||
txt_path.write_text(text.strip(), encoding="utf-8")
|
||||
logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])
|
||||
|
||||
await _send(ws, "xtts_voice_saved", {
|
||||
"name": name, "size": int(size_kb * 1024), "refText": text.strip(),
|
||||
})
|
||||
# Liste aktualisieren
|
||||
await handle_list_voices(ws)
|
||||
except Exception as e:
|
||||
logger.exception("voice_upload Fehler")
|
||||
await _send(ws, "xtts_voice_saved", {"name": name, "error": str(e)[:200]})
|
||||
|
||||
|
||||
async def handle_list_voices(ws) -> None:
|
||||
try:
|
||||
voices = []
|
||||
if VOICES_DIR.exists():
|
||||
for wav in sorted(VOICES_DIR.glob("*.wav")):
|
||||
txt = wav.with_suffix(".txt")
|
||||
voices.append({
|
||||
"name": wav.stem,
|
||||
"file": wav.name,
|
||||
"size": wav.stat().st_size,
|
||||
"hasRefText": txt.exists(),
|
||||
})
|
||||
logger.info("Stimmen-Liste: %d", len(voices))
|
||||
await _send(ws, "xtts_voices_list", {"voices": voices})
|
||||
except Exception:
|
||||
logger.exception("handle_list_voices Fehler")
|
||||
|
||||
|
||||
async def handle_delete_voice(ws, payload: dict) -> None:
|
||||
name = (payload.get("name") or "").strip()
|
||||
if not name:
|
||||
return
|
||||
try:
|
||||
wav, txt = voice_paths(name)
|
||||
for p in (wav, txt):
|
||||
if p.exists():
|
||||
p.unlink()
|
||||
logger.info("Voice geloescht: %s", p)
|
||||
await handle_list_voices(ws)
|
||||
except Exception:
|
||||
logger.exception("handle_delete_voice Fehler")
|
||||
|
||||
|
||||
# Letzte diagnostisch-gesetzte Voice (verhindert Endlos-Preload bei jedem config)
|
||||
_last_diag_voice = ""
|
||||
|
||||
|
||||
async def handle_voice_preload(ws, payload: dict, runner: F5Runner) -> None:
|
||||
voice = (payload.get("voice") or "").strip()
|
||||
request_id = payload.get("requestId", "")
|
||||
logger.info("Voice-Preload angefordert: '%s'", voice or "default")
|
||||
|
||||
try:
|
||||
ref_wav, ref_txt = voice_paths(voice) if voice else (None, None)
|
||||
if voice and (not ref_wav or not ref_wav.exists()):
|
||||
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": "voice-file-not-found"})
|
||||
return
|
||||
|
||||
# Ref-Text sicherstellen (falls nur WAV da ist)
|
||||
if voice and ref_txt and not ref_txt.exists():
|
||||
text = await request_transcription(ws, ref_wav, language="de")
|
||||
if text:
|
||||
ref_txt.write_text(text.strip(), encoding="utf-8")
|
||||
logger.info("Referenz-Text beim Preload nachgezogen")
|
||||
|
||||
# Dummy-Render zum Warmup
|
||||
t0 = time.time()
|
||||
await _do_tts(ws, runner, "ja.", voice, f"preload-{request_id}", "", "de")
|
||||
ms = int((time.time() - t0) * 1000)
|
||||
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "loadMs": ms})
|
||||
except Exception as e:
|
||||
logger.exception("Voice-Preload Fehler")
|
||||
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": str(e)[:200]})
|
||||
|
||||
|
||||
# ── Haupt-Loop ──────────────────────────────────────────────
|
||||
|
||||
async def run_loop(runner: F5Runner) -> None:
|
||||
# Preload im Hintergrund starten damit der Startup nicht blockiert
|
||||
asyncio.create_task(runner.ensure_loaded())
|
||||
|
||||
use_tls = RVS_TLS
|
||||
retry_s = 2
|
||||
tls_fallback_tried = False
|
||||
global _last_diag_voice
|
||||
|
||||
while True:
|
||||
scheme = "wss" if use_tls else "ws"
|
||||
url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
|
||||
masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
|
||||
|
||||
try:
|
||||
logger.info("Verbinde zu RVS: %s", masked)
|
||||
async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
|
||||
logger.info("RVS verbunden")
|
||||
retry_s = 2
|
||||
tls_fallback_tried = False
|
||||
|
||||
# TTS-Worker fuer diese Verbindung starten
|
||||
worker = asyncio.create_task(_tts_worker(ws, runner))
|
||||
|
||||
try:
|
||||
async for raw in ws:
|
||||
try:
|
||||
msg = json.loads(raw)
|
||||
except Exception:
|
||||
continue
|
||||
mtype = msg.get("type", "")
|
||||
payload = msg.get("payload", {}) or {}
|
||||
|
||||
if mtype == "xtts_request":
|
||||
await _tts_queue.put((
|
||||
payload.get("text", ""),
|
||||
payload.get("voice", "") or "",
|
||||
payload.get("requestId", ""),
|
||||
payload.get("messageId", ""),
|
||||
payload.get("language", "de"),
|
||||
))
|
||||
elif mtype == "voice_upload":
|
||||
asyncio.create_task(handle_voice_upload(ws, payload))
|
||||
elif mtype == "xtts_list_voices":
|
||||
asyncio.create_task(handle_list_voices(ws))
|
||||
elif mtype == "xtts_delete_voice":
|
||||
asyncio.create_task(handle_delete_voice(ws, payload))
|
||||
elif mtype == "voice_preload":
|
||||
asyncio.create_task(handle_voice_preload(ws, payload, runner))
|
||||
elif mtype == "stt_response":
|
||||
# Antwort auf unseren internen Transkriptions-Request
|
||||
req_id = payload.get("requestId", "")
|
||||
fut = _pending_stt.get(req_id)
|
||||
if fut and not fut.done():
|
||||
if payload.get("error"):
|
||||
fut.set_result(None)
|
||||
else:
|
||||
fut.set_result(payload.get("text") or "")
|
||||
elif mtype == "config":
|
||||
v = (payload.get("xttsVoice") or "").strip()
|
||||
if v and v != _last_diag_voice:
|
||||
_last_diag_voice = v
|
||||
asyncio.create_task(handle_voice_preload(
|
||||
ws, {"voice": v, "source": "diagnostic"}, runner,
|
||||
))
|
||||
elif not v:
|
||||
_last_diag_voice = ""
|
||||
finally:
|
||||
worker.cancel()
|
||||
try:
|
||||
await worker
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.warning("Verbindung verloren: %s", e)
|
||||
if use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
|
||||
logger.info("TLS fehlgeschlagen — Fallback auf ws://")
|
||||
use_tls = False
|
||||
tls_fallback_tried = True
|
||||
continue
|
||||
await asyncio.sleep(min(retry_s, 30))
|
||||
retry_s = min(retry_s * 2, 30)
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
if not RVS_HOST:
|
||||
logger.error("RVS_HOST nicht gesetzt — Abbruch")
|
||||
sys.exit(1)
|
||||
VOICES_DIR.mkdir(parents=True, exist_ok=True)
|
||||
runner = F5Runner()
|
||||
await run_loop(runner)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(main())
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
f5-tts>=1.0.0
|
||||
websockets>=12.0
|
||||
numpy>=1.24
|
||||
soundfile>=0.12
|
||||
requests>=2.31
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
{
|
||||
"name": "aria-xtts-bridge",
|
||||
"version": "1.0.0",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"ws": "^8.16.0"
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue