feat(phase2): XTTS durch F5-TTS ersetzt — Voice Cloning auf der Gamebox

Neuer aria-f5tts-bridge Container:
  - Python-Service, laedt F5TTS_v1_Base beim Start
  - Empfaengt xtts_request via RVS, synthetisiert mit Voice-Cloning,
    streamt PCM-Chunks (audio_pcm, 16-bit s16le) wie zuvor die XTTS-Bridge
  - Teilt lange Texte an Satzgrenzen, streamt satzweise
  - Fade-In auf erstem Chunk, Queue gegen parallel-Render

Voice-Management:
  - Speicherort weiterhin /voices/, aber jetzt als Paar
    {name}.wav + {name}.txt (F5-TTS braucht Referenz-Transkription)
  - voice_upload: WAV speichern, intern stt_request an whisper-bridge
    senden, Transkription als .txt ablegen → user muss nichts eintippen
  - On-the-fly Transkribierung: wenn eine WAV ohne .txt liegt, wird
    bei erstem Render/Preload nachgezogen
  - Bestehende RVS-Messages (voice_upload/xtts_list_voices/... etc.)
    bleiben unveraendert → keine App/Diagnostic-Aenderung noetig

Gaming-PC docker-compose:
  - xtts + xtts-bridge Services entfernt
  - f5tts-bridge + whisper-bridge bleiben/kommen rein
  - Volume xtts-models → f5tts-models

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
duffyduck 2026-04-24 14:34:11 +02:00
parent e170991222
commit 576ae925dd
7 changed files with 628 additions and 644 deletions

View File

@ -1,5 +0,0 @@
FROM node:22-alpine
WORKDIR /app
COPY bridge.js package.json ./
RUN npm install --production
CMD ["node", "bridge.js"]

View File

@ -1,596 +0,0 @@
/**
* ARIA XTTS Bridge Verbindet XTTS v2 Server mit dem RVS
*
* Empfaengt tts_request ueber RVS rendert Audio via XTTS API sendet zurueck
* Empfaengt voice_upload speichert Voice-Sample fuer Cloning
* Empfaengt xtts_list_voices listet verfuegbare Stimmen
*/
const WebSocket = require("ws");
const http = require("http");
const https = require("https");
const fs = require("fs");
const path = require("path");
const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000";
const RVS_HOST = process.env.RVS_HOST || "";
const RVS_PORT = process.env.RVS_PORT || "443";
const RVS_TLS = process.env.RVS_TLS || "true";
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
const RVS_TOKEN = process.env.RVS_TOKEN || "";
const VOICES_DIR = "/voices";
function log(msg) {
console.log(`[${new Date().toISOString()}] ${msg}`);
}
// ── RVS Verbindung ──────────────────────────────────
let rvsWs = null;
let retryDelay = 2;
function connectRVS(forcePlain) {
if (!RVS_HOST || !RVS_TOKEN) {
log("RVS nicht konfiguriert — beende");
process.exit(1);
}
const useTls = RVS_TLS === "true" && !forcePlain;
const proto = useTls ? "wss" : "ws";
const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;
log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);
const ws = new WebSocket(url);
ws.on("open", () => {
log("RVS verbunden — warte auf TTS-Requests");
rvsWs = ws;
retryDelay = 2;
// Keepalive
setInterval(() => {
if (ws.readyState === WebSocket.OPEN) {
ws.ping();
ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
}
}, 25000);
});
ws.on("message", async (raw) => {
try {
const msg = JSON.parse(raw.toString());
if (msg.type === "xtts_request") {
await handleTTSRequest(msg.payload);
} else if (msg.type === "voice_upload") {
await handleVoiceUpload(msg.payload);
} else if (msg.type === "xtts_list_voices") {
await handleListVoices();
} else if (msg.type === "xtts_delete_voice") {
await handleDeleteVoice(msg.payload);
} else if (msg.type === "voice_preload") {
await handleVoicePreload(msg.payload);
} else if (msg.type === "config") {
// Diagnostic hat globale Voice gewechselt → Preload damit der naechste
// Render ohne Ladewartezeit startet + alle Clients "voice_ready" sehen
const v = msg.payload && msg.payload.xttsVoice;
if (v && v !== lastDiagnosticVoice) {
lastDiagnosticVoice = v;
await handleVoicePreload({ voice: v, source: "diagnostic" });
} else if (!v) {
lastDiagnosticVoice = "";
}
}
} catch (err) {
log(`Fehler: ${err.message}`);
}
});
ws.on("close", () => {
log("RVS Verbindung geschlossen");
rvsWs = null;
setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
retryDelay = Math.min(retryDelay * 2, 30);
});
ws.on("error", (err) => {
log(`RVS Fehler: ${err.message}`);
if (useTls && RVS_TLS_FALLBACK === "true") {
log("TLS fehlgeschlagen — Fallback auf ws://");
ws.removeAllListeners();
try { ws.close(); } catch (_) {}
connectRVS(true);
}
});
}
// ── TTS Request Handler ─────────────────────────────
/**
* Linearer Fade-In auf einen base64-PCM-Chunk (s16le).
* Mascht XTTS-Warmup-Glitches am Anfang eines Renders.
*/
function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) {
const buf = Buffer.from(base64Pcm, "base64");
const totalSamples = buf.length / 2; // s16le
const fadeSamples = Math.min(
Math.floor((fadeMs / 1000) * sampleRate) * channels,
totalSamples
);
for (let i = 0; i < fadeSamples; i++) {
const sample = buf.readInt16LE(i * 2);
const gain = i / fadeSamples;
buf.writeInt16LE(Math.round(sample * gain), i * 2);
}
return buf.toString("base64");
}
// ── TTS-Queue ──────────────────────────────────────
// XTTS verarbeitet Requests sequenziell, damit Streams sich nicht ueberlappen.
// Ohne Queue wuerden parallele Requests parallel streamen → App bekommt
// interleaved PCM-Chunks aus zwei Rendern → klingt wie Chaos.
let ttsQueue = Promise.resolve();
// Merkt sich die letzte in Diagnostic gewaehlte Voice, damit wir nicht bei jedem
// config-Broadcast (auch ohne Aenderung) einen Preload triggern.
let lastDiagnosticVoice = "";
function handleTTSRequest(payload) {
ttsQueue = ttsQueue.then(() => _runTTSRequest(payload)).catch(err => {
log(`TTS-Queue Fehler: ${err.message}`);
});
return ttsQueue;
}
async function _runTTSRequest(payload) {
const { text, voice, requestId, language, messageId } = payload;
if (!text) return;
// Markdown-Cleanup (Bridge macht jetzt auch Cleanup, aber safety net)
let cleanText = text
.replace(/\*\*([^*]+)\*\*/g, "$1")
.replace(/\*([^*]+)\*/g, "$1")
.replace(/`([^`]+)`/g, "$1")
.replace(/```[\s\S]*?```/g, "")
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
.replace(/#{1,6}\s*/g, "")
.replace(/>\s*/g, "")
.replace(/[-*]\s+/g, "")
.replace(/\n{2,}/g, ". ")
.replace(/\n/g, ", ")
.replace(/\s{2,}/g, " ")
.replace(/["""„]/g, "")
.replace(/\(\)/g, "")
.trim();
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
try {
// Im local-Mode erwartet daswer123 XTTS speaker_wav als Basename (ohne .wav,
// ohne Pfad) — der Server prefixt EXAMPLE_FOLDER selbst. Wir checken hier
// nur das physische File ab um Warnungen zu loggen; runter ans API geht
// nur der Name.
const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath);
const speakerName = hasCustomVoice ? voice : "";
if (voice && !hasCustomVoice) {
log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`);
} else if (hasCustomVoice) {
log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`);
}
let chunkIndex = 0;
let pcmMeta = null;
let firstChunkSeen = false;
const onChunk = (pcmBase64, meta) => {
if (!pcmMeta) pcmMeta = meta;
let outBase64 = pcmBase64;
// Fade-In auf den ersten Chunk — maskiert XTTS-Warmup-Glitches
// (autoregressiver Generator hat am Anfang wenig Kontext → Artefakte).
if (!firstChunkSeen && pcmBase64) {
firstChunkSeen = true;
outBase64 = applyFadeIn(pcmBase64, meta.sampleRate, meta.channels, 120);
}
sendToRVS({
type: "audio_pcm",
payload: {
requestId: requestId || "",
messageId: messageId || "",
base64: outBase64,
format: "pcm_s16le",
sampleRate: meta.sampleRate,
channels: meta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: false,
},
timestamp: Date.now(),
});
};
// /tts_stream fuer echtes Streaming (funktioniert im XTTS local-Mode).
// Wenn Server im apiManual/api-Mode laeuft: 400 → Fallback auf /tts_to_audio/.
try {
await streamXTTSAsPCM(
cleanText,
language || "de",
speakerName,
onChunk,
);
} catch (streamErr) {
log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`);
await streamXTTSBatch(
cleanText,
language || "de",
speakerName,
onChunk,
);
}
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
if (pcmMeta) {
sendToRVS({
type: "audio_pcm",
payload: {
requestId: requestId || "",
messageId: messageId || "",
base64: "",
format: "pcm_s16le",
sampleRate: pcmMeta.sampleRate,
channels: pcmMeta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: true,
},
timestamp: Date.now(),
});
}
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
} catch (err) {
log(`TTS Fehler: ${err.message}`);
sendToRVS({
type: "xtts_response",
payload: { requestId, error: err.message },
timestamp: Date.now(),
});
}
}
/**
* Ruft /tts_stream auf echter Streaming-Endpoint bei daswer123.
* Schickt was der Server verlangt (allow: GET), aber mit JSON-Body
* als POST scheitert mit 405. Manche Versionen wollen GET + Query,
* andere POST + JSON. Testen was funktioniert.
*/
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
return new Promise((resolve, reject) => {
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
// stream_chunk_size=250: grosse Chunks = wenige Chunk-Grenzen = wenig
// Render-Artefakte. daswer123 erzeugt an Chunk-Boundaries haeufig Glitches
// in den Worten die ueber die Grenze gehen. Hoehere Latenz ist OK.
const qs = new URLSearchParams();
qs.set("text", text);
qs.set("language", language || "de");
qs.set("speaker_wav", speakerWav || "");
qs.set("stream_chunk_size", "250");
const url = new URL(XTTS_API_URL);
const fullPath = `/tts_stream?${qs.toString()}`;
const options = {
hostname: url.hostname,
port: url.port || 80,
path: fullPath,
method: "GET",
timeout: 60000,
};
log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`);
const req = http.request(options, (res) => {
if (res.statusCode !== 200) {
let body = "";
res.on("data", (d) => { body += d.toString(); });
res.on("end", () => {
log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`);
reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`));
});
return;
}
log(`TTS stream verbunden, empfange PCM...`);
let headerParsed = false;
let sampleRate = 24000;
let channels = 1;
let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk
const HEADER_BYTES = 44;
let headerBuf = Buffer.alloc(0);
const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono
res.on("data", (chunk) => {
let data = chunk;
// WAV-Header konsumieren (44 Bytes)
if (!headerParsed) {
headerBuf = Buffer.concat([headerBuf, data]);
if (headerBuf.length < HEADER_BYTES) return;
// Header lesen
const header = headerBuf.slice(0, HEADER_BYTES);
try {
channels = header.readUInt16LE(22);
sampleRate = header.readUInt32LE(24);
} catch (_) {}
headerParsed = true;
data = headerBuf.slice(HEADER_BYTES);
}
// leftover aus vorherigem Chunk + neuer data
let combined = Buffer.concat([leftover, data]);
// In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits)
while (combined.length >= PCM_CHUNK_BYTES) {
const slice = combined.slice(0, PCM_CHUNK_BYTES);
combined = combined.slice(PCM_CHUNK_BYTES);
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
}
leftover = combined;
});
res.on("end", () => {
// Rest-Daten senden
if (leftover.length > 0) {
onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
}
resolve();
});
res.on("error", reject);
});
req.on("error", reject);
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS API Timeout (60s)")); });
req.end();
});
}
/**
* Fallback: /tts_to_audio/ (POST JSON) rendert komplett, dann response.
* Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht.
* Shared chunking-Logik mit streamXTTSAsPCM parst WAV-Header, stueckelt PCM.
*/
function streamXTTSBatch(text, language, speakerWav, onPcmChunk) {
return new Promise((resolve, reject) => {
const body = JSON.stringify({
text,
language: language || "de",
speaker_wav: speakerWav || "",
});
const url = new URL(XTTS_API_URL);
const options = {
hostname: url.hostname,
port: url.port || 80,
path: "/tts_to_audio/",
method: "POST",
headers: {
"Content-Type": "application/json",
"Content-Length": Buffer.byteLength(body),
},
timeout: 60000,
};
const req = http.request(options, (res) => {
if (res.statusCode !== 200) {
let rb = "";
res.on("data", (d) => { rb += d.toString(); });
res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`)));
return;
}
let headerParsed = false;
let sampleRate = 24000;
let channels = 1;
let leftover = Buffer.alloc(0);
let headerBuf = Buffer.alloc(0);
const HEADER_BYTES = 44;
const PCM_CHUNK_BYTES = 8192;
res.on("data", (chunk) => {
let data = chunk;
if (!headerParsed) {
headerBuf = Buffer.concat([headerBuf, data]);
if (headerBuf.length < HEADER_BYTES) return;
const header = headerBuf.slice(0, HEADER_BYTES);
try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {}
headerParsed = true;
data = headerBuf.slice(HEADER_BYTES);
}
let combined = Buffer.concat([leftover, data]);
while (combined.length >= PCM_CHUNK_BYTES) {
const slice = combined.slice(0, PCM_CHUNK_BYTES);
combined = combined.slice(PCM_CHUNK_BYTES);
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
}
leftover = combined;
});
res.on("end", () => {
if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
resolve();
});
res.on("error", reject);
});
req.on("error", reject);
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); });
req.write(body);
req.end();
});
}
// ── Voice Upload Handler ────────────────────────────
async function handleVoiceUpload(payload) {
const { name, samples } = payload;
if (!name || !samples || !Array.isArray(samples) || samples.length === 0) {
log("Voice Upload: Ungueltige Daten");
return;
}
log(`Voice Upload: "${name}" (${samples.length} Samples)`);
try {
// Alle Samples zusammenfuegen
const buffers = samples.map(s => Buffer.from(s.base64, "base64"));
const combined = Buffer.concat(buffers);
// Als WAV speichern
fs.mkdirSync(VOICES_DIR, { recursive: true });
const filePath = path.join(VOICES_DIR, `${name.replace(/[^a-zA-Z0-9_-]/g, "_")}.wav`);
fs.writeFileSync(filePath, combined);
log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);
sendToRVS({
type: "xtts_voice_saved",
payload: { name, size: combined.length, path: filePath },
timestamp: Date.now(),
});
} catch (err) {
log(`Voice Upload Fehler: ${err.message}`);
}
}
// ── Voice Delete Handler ────────────────────────────
async function handleDeleteVoice(payload) {
const { name } = payload || {};
if (!name || typeof name !== "string") {
log("Voice Delete: ungueltiger Name");
return;
}
const safe = name.replace(/[^a-zA-Z0-9_-]/g, "_");
const filePath = path.join(VOICES_DIR, `${safe}.wav`);
try {
if (fs.existsSync(filePath)) {
fs.unlinkSync(filePath);
log(`Voice geloescht: ${filePath}`);
} else {
log(`Voice Delete: Datei existiert nicht (${filePath})`);
}
// Aktualisierte Liste an alle Clients senden
await handleListVoices();
} catch (err) {
log(`Voice Delete Fehler: ${err.message}`);
}
}
// ── Voice List Handler ──────────────────────────────
/**
* Preload einer Stimme rendert stumm ein kurzes Dummy-Audio, damit XTTS
* die Speaker-Latents laedt und der naechste echte Request ohne Wartezeit
* loslegen kann. Broadcastet "voice_ready" wenn fertig (oder mit error).
*/
async function handleVoicePreload(payload) {
const voice = (payload && payload.voice) || "";
const source = (payload && payload.source) || "unknown";
const requestId = (payload && payload.requestId) || "";
log(`Voice-Preload angefordert: "${voice}" (source=${source})`);
try {
let speakerName = "";
if (voice) {
const voiceFilePath = path.join(VOICES_DIR, `${voice}.wav`);
if (!fs.existsSync(voiceFilePath)) {
sendToRVS({
type: "voice_ready",
payload: { voice, requestId, error: "voice-file-not-found" },
timestamp: Date.now(),
});
log(`Preload abgebrochen: ${voiceFilePath} existiert nicht`);
return;
}
speakerName = voice;
}
// Dummy-Request via Queue — damit sich Preload nicht mit echtem TTS ueberholt.
const t0 = Date.now();
await new Promise((resolve, reject) => {
ttsQueue = ttsQueue.then(async () => {
try {
await streamXTTSAsPCM("ja.", "de", speakerName, () => {});
resolve();
} catch (err) {
reject(err);
}
}).catch(reject);
});
const ms = Date.now() - t0;
log(`Voice "${voice || "default"}" geladen in ${ms}ms`);
sendToRVS({
type: "voice_ready",
payload: { voice, requestId, loadMs: ms },
timestamp: Date.now(),
});
} catch (err) {
log(`Voice-Preload Fehler: ${err.message}`);
sendToRVS({
type: "voice_ready",
payload: { voice, requestId, error: err.message.slice(0, 200) },
timestamp: Date.now(),
});
}
}
async function handleListVoices() {
try {
const files = fs.existsSync(VOICES_DIR)
? fs.readdirSync(VOICES_DIR).filter(f => f.endsWith(".wav"))
: [];
const voices = files.map(f => ({
name: path.basename(f, ".wav"),
file: f,
size: fs.statSync(path.join(VOICES_DIR, f)).size,
}));
log(`Stimmen: ${voices.length} verfuegbar`);
sendToRVS({
type: "xtts_voices_list",
payload: { voices },
timestamp: Date.now(),
});
} catch (err) {
log(`Stimmen-Liste Fehler: ${err.message}`);
}
}
// ── RVS senden ──────────────────────────────────────
function sendToRVS(msg) {
if (rvsWs && rvsWs.readyState === WebSocket.OPEN) {
rvsWs.send(JSON.stringify(msg));
}
}
// ── Start ───────────────────────────────────────────
log("ARIA XTTS Bridge startet...");
log(`XTTS API: ${XTTS_API_URL}`);
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);
// Warten bis XTTS API erreichbar ist
function waitForXTTS(callback, attempts) {
if (attempts <= 0) { log("XTTS API nicht erreichbar — starte trotzdem"); callback(); return; }
http.get(`${XTTS_API_URL}/docs`, (res) => {
log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
callback();
}).on("error", () => {
log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
setTimeout(() => waitForXTTS(callback, attempts - 1), 10000); // 10s statt 5s (Model laden dauert)
});
}
waitForXTTS(() => connectRVS(), 30); // Max 5min warten

View File

@ -1,7 +1,7 @@
# ════════════════════════════════════════════════
# ARIA XTTS v2 — GPU TTS Server
# ARIA Gamebox Stack — GPU F5-TTS + Whisper STT
# Laeuft auf dem Gaming-PC (RTX 3060)
# Verbindet sich zum RVS fuer TTS-Requests
# Verbindet sich zum RVS fuer TTS/STT-Requests
# ════════════════════════════════════════════════
#
# Voraussetzungen:
@ -10,15 +10,18 @@
# - .env mit RVS-Verbindungsdaten
#
# Start: docker compose up -d
# Test: curl http://localhost:8000/docs
# ════════════════════════════════════════════════
services:
# ─── XTTS v2 API Server (GPU) ─────────────────
xtts:
image: daswer123/xtts-api-server:latest
container_name: aria-xtts
# ─── F5-TTS Bridge (GPU) ──────────────────────
# Ersetzt den frueheren XTTS-Stack. Empfaengt xtts_request via RVS,
# rendert via F5-TTS mit Voice-Cloning, streamt PCM an die App.
# Voice-Upload: speichert WAV und laesst whisper-bridge den Referenz-
# text transkribieren — der User muss nichts eintippen.
f5tts-bridge:
build: ./f5tts
container_name: aria-f5tts-bridge
deploy:
resources:
reservations:
@ -26,45 +29,29 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
ports:
- "8000:8020"
volumes:
- xtts-models:/app/xtts_models # Model-Cache (~2GB)
- ./voices:/voices # Custom Voice Samples
- ./voices:/voices # WAV + TXT Referenz
- f5tts-models:/root/.cache/huggingface # Model-Cache persistieren
environment:
- COQUI_TOS_AGREED=1
# Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM,
# Render startet sofort, /tts_stream funktioniert.
# Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"}
- MODEL_SOURCE=local
# Speaker-Folder auf unsere gemounteten voices zeigen lassen
- EXAMPLE_FOLDER=/voices
restart: unless-stopped
# ─── XTTS Bridge (verbindet zu RVS) ───────────
xtts-bridge:
build: .
container_name: aria-xtts-bridge
depends_on:
- xtts
volumes:
- ./voices:/voices # Shared mit XTTS-Server
environment:
- XTTS_API_URL=http://xtts:8020
- RVS_HOST=${RVS_HOST}
- RVS_PORT=${RVS_PORT:-443}
- RVS_TLS=${RVS_TLS:-true}
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
- RVS_TOKEN=${RVS_TOKEN}
- F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base}
- F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
- VOICES_DIR=/voices
restart: unless-stopped
# ─── Whisper STT (GPU) ────────────────────────
# Faster-Whisper auf der Gamebox statt auf der VM (CPU) —
# deutlich schneller. Verbindet sich selbst per WebSocket an
# den RVS und nimmt dort stt_request Nachrichten der aria-bridge
# entgegen, antwortet mit stt_response. Laedt das Modell beim
# Start vor; auf Config-Broadcasts (Diagnostic → whisperModel)
# wird zur Laufzeit hot-swapped.
# entgegen, antwortet mit stt_response. Zusaetzlich nutzt die
# f5tts-bridge Whisper intern fuer die Referenz-Transkription bei
# Voice-Uploads. Laedt das Modell beim Start vor; auf Config-
# Broadcasts (Diagnostic → whisperModel) wird zur Laufzeit hot-
# swapped.
whisper-bridge:
build: ./whisper
container_name: aria-whisper-bridge
@ -86,9 +73,9 @@ services:
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
volumes:
- whisper-models:/root/.cache/huggingface # Model-Cache persistieren
- whisper-models:/root/.cache/huggingface
restart: unless-stopped
volumes:
xtts-models:
f5tts-models:
whisper-models:

21
xtts/f5tts/Dockerfile Normal file
View File

@ -0,0 +1,21 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip ffmpeg git \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# PyTorch CUDA-Wheels zuerst (f5-tts zieht sonst CPU-only Torch rein)
RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
--index-url https://download.pytorch.org/whl/cu121
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
COPY bridge.py .
CMD ["python3", "bridge.py"]

580
xtts/f5tts/bridge.py Normal file
View File

@ -0,0 +1,580 @@
#!/usr/bin/env python3
"""
ARIA F5-TTS Bridge laeuft auf der Gamebox (RTX 3060).
Empfaengt xtts_request via RVS F5-TTS Voice Cloning auf GPU streamt
16-bit PCM Chunks als audio_pcm Nachrichten zurueck an die App.
Voice-Layout im VOICES_DIR:
{name}.wav Referenz-Audio (6-10s, 24kHz mono empfohlen)
{name}.txt Referenz-Text (UTF-8, was im WAV gesprochen wird)
Beim voice_upload senden wir intern einen stt_request an die whisper-bridge
und legen die Transkription als .txt ab der User muss keinen Text eingeben.
Env:
RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
F5TTS_MODEL Default: F5TTS_v1_Base
F5TTS_DEVICE Default: cuda
VOICES_DIR Default: /voices
"""
import asyncio
import base64
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import time
import uuid
from pathlib import Path
from typing import Optional
import numpy as np
import soundfile as sf
import websockets
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger("f5tts-bridge")
# HuggingFace + Torch download-Logs etwas daempfen
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
RVS_HOST = os.getenv("RVS_HOST", "").strip()
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base")
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")
VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16
TARGET_SR = 24000 # F5-TTS native
# ── Lazy F5-TTS Loader ──────────────────────────────────────
_F5TTS_cls = None
def _get_f5tts_cls():
"""Lazy import damit Startup-Logs nicht durch Torch-Warnungen zumuellen."""
global _F5TTS_cls
if _F5TTS_cls is None:
from f5_tts.api import F5TTS as _cls
_F5TTS_cls = _cls
return _F5TTS_cls
class F5Runner:
"""Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking)."""
def __init__(self) -> None:
self.model = None
self._lock = asyncio.Lock()
def _load_blocking(self) -> None:
cls = _get_f5tts_cls()
logger.info("Lade F5-TTS '%s' (device=%s)...", F5TTS_MODEL, F5TTS_DEVICE)
t0 = time.time()
self.model = cls(model=F5TTS_MODEL, device=F5TTS_DEVICE)
logger.info("F5-TTS geladen in %.1fs", time.time() - t0)
async def ensure_loaded(self) -> None:
async with self._lock:
if self.model is not None:
return
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, self._load_blocking)
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
wav, sr, _ = self.model.infer(
ref_file=ref_wav,
ref_text=ref_text,
gen_text=gen_text,
remove_silence=True,
seed=-1,
)
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
if not isinstance(wav, np.ndarray):
wav = np.asarray(wav, dtype=np.float32)
if wav.ndim > 1:
wav = wav.squeeze()
return wav.astype(np.float32), int(sr)
async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
await self.ensure_loaded()
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text)
# ── Helpers ─────────────────────────────────────────────────
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")
def split_sentences(text: str, max_len: int = 350) -> list[str]:
"""Teilt langen Text an Satzgrenzen. Kurze Texte bleiben als-is."""
text = text.strip()
if not text:
return []
if len(text) <= max_len:
return [text]
parts = [p.strip() for p in _SENTENCE_SPLIT.split(text) if p.strip()]
# Zu kurze Fragmente mergen damit F5-TTS nicht an jedem Komma neu startet
merged: list[str] = []
buf = ""
for p in parts:
if len(buf) + len(p) + 1 <= max_len:
buf = f"{buf} {p}".strip()
else:
if buf:
merged.append(buf)
buf = p
if buf:
merged.append(buf)
return merged or [text]
def float_to_pcm16(wav: np.ndarray) -> bytes:
"""Float32 (-1..+1) → int16 little-endian bytes."""
wav = np.clip(wav, -1.0, 1.0)
pcm = (wav * 32767.0).astype(np.int16)
return pcm.tobytes()
def sanitize_voice_name(name: str) -> str:
return re.sub(r"[^a-zA-Z0-9_-]", "_", name)
def voice_paths(name: str) -> tuple[Path, Path]:
safe = sanitize_voice_name(name)
return VOICES_DIR / f"{safe}.wav", VOICES_DIR / f"{safe}.txt"
def ensure_24k_mono_wav(src_wav: Path) -> Path:
"""F5-TTS moechte 24kHz mono als Referenz — ffmpeg konvertiert inplace.
Wenn das File schon passt, wird nichts geaendert. Sonst wird es
reingeschrieben (Original wird ueberschrieben).
"""
try:
info = sf.info(str(src_wav))
if info.samplerate == TARGET_SR and info.channels == 1:
return src_wav
except Exception:
pass
tmp_out = src_wav.with_suffix(".conv.wav")
cmd = ["ffmpeg", "-y", "-i", str(src_wav),
"-ar", str(TARGET_SR), "-ac", "1", "-f", "wav", str(tmp_out)]
r = subprocess.run(cmd, capture_output=True, timeout=30)
if r.returncode != 0:
logger.warning("ffmpeg-Konvertierung von %s fehlgeschlagen: %s",
src_wav, r.stderr.decode(errors="replace")[:200])
try:
tmp_out.unlink()
except OSError:
pass
return src_wav
os.replace(tmp_out, src_wav)
return src_wav
async def _send(ws, mtype: str, payload: dict) -> None:
try:
await ws.send(json.dumps({
"type": mtype,
"payload": payload,
"timestamp": int(time.time() * 1000),
}))
except Exception as e:
logger.warning("Send fehlgeschlagen (%s): %s", mtype, e)
# ── Interne Transkription via whisper-bridge ────────────────
_pending_stt: dict[str, asyncio.Future] = {}
_STT_TIMEOUT_S = 60.0
async def request_transcription(ws, wav_path: Path, language: str = "de") -> Optional[str]:
"""Sendet einen stt_request an die whisper-bridge (ueber RVS) und wartet auf stt_response."""
try:
with open(wav_path, "rb") as f:
audio_b64 = base64.b64encode(f.read()).decode("ascii")
except Exception as e:
logger.error("Lesen %s fehlgeschlagen: %s", wav_path, e)
return None
request_id = str(uuid.uuid4())
loop = asyncio.get_event_loop()
fut: asyncio.Future = loop.create_future()
_pending_stt[request_id] = fut
try:
await _send(ws, "stt_request", {
"requestId": request_id,
"audio": audio_b64,
"mimeType": "audio/wav",
"model": "small", # klein reicht fuer Voice-Referenz
"language": language,
})
return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S)
except asyncio.TimeoutError:
logger.warning("Transkription Timeout fuer %s", wav_path.name)
return None
except Exception as e:
logger.warning("Transkription Fehler: %s", e)
return None
finally:
_pending_stt.pop(request_id, None)
# ── TTS-Request Handler ─────────────────────────────────────
# Queue damit sich parallele Requests nicht ueberlappen (GPU-Throughput)
_tts_queue: asyncio.Queue[tuple] = asyncio.Queue()
async def _tts_worker(ws, runner: F5Runner) -> None:
"""Serialisiert Synthesen — GPU kann sonst OOM gehen."""
while True:
text, voice, request_id, message_id, language = await _tts_queue.get()
try:
await _do_tts(ws, runner, text, voice, request_id, message_id, language)
except Exception:
logger.exception("TTS-Worker Fehler")
finally:
_tts_queue.task_done()
async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
request_id: str, message_id: str, language: str) -> None:
t0 = time.time()
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
has_custom = bool(voice and ref_wav_path and ref_wav_path.exists() and ref_txt_path.exists())
if voice and not has_custom:
# Wenn nur WAV da ist aber kein txt → on-the-fly transkribieren
if ref_wav_path and ref_wav_path.exists() and (not ref_txt_path or not ref_txt_path.exists()):
logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
text_ref = await request_transcription(ws, ref_wav_path, language)
if text_ref:
try:
ref_txt_path.write_text(text_ref.strip(), encoding="utf-8")
has_custom = True
logger.info("Referenz-Text nachgezogen: '%s'", text_ref[:60])
except Exception as e:
logger.warning("Referenz-Text speichern fehlgeschlagen: %s", e)
if not has_custom:
logger.warning("Voice '%s' nicht komplett (%s, txt=%s) — nehme Default",
voice, ref_wav_path, (ref_txt_path and ref_txt_path.exists()))
if has_custom:
ref_wav_str = str(ref_wav_path)
ref_text = ref_txt_path.read_text(encoding="utf-8").strip()
else:
# Fallback: kein Custom-Voice. F5-TTS braucht IMMER eine Referenz,
# wir nehmen default_ref.wav/txt falls vorhanden, sonst die erste
# gefundene Voice im Ordner.
default_wav = VOICES_DIR / "default_ref.wav"
default_txt = VOICES_DIR / "default_ref.txt"
if default_wav.exists() and default_txt.exists():
ref_wav_str = str(default_wav)
ref_text = default_txt.read_text(encoding="utf-8").strip()
else:
# Nimm irgendein vorhandenes voice-Paar
pair = next(
((w, t) for w, t in (
(v, v.with_suffix(".txt")) for v in VOICES_DIR.glob("*.wav")
) if t.exists()),
None,
)
if not pair:
logger.error("Keine Referenz-Stimme im VOICES_DIR — TTS abgebrochen")
return
ref_wav_str, ref_text = str(pair[0]), pair[1].read_text(encoding="utf-8").strip()
sentences = split_sentences(text)
logger.info("F5-TTS: %d Satz(e), voice=%s (%s)", len(sentences), voice or "default", ref_wav_str)
chunk_index = 0
pcm_sr = TARGET_SR
for i, sent in enumerate(sentences):
try:
wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
pcm_sr = sr
pcm_bytes = float_to_pcm16(wav)
# Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert
# eventuelle Warmup-Glitches). Alle anderen Chunks bleiben wie sind.
if i == 0 and chunk_index == 0:
pcm_bytes = _fade_in_pcm16(pcm_bytes, sr, 120)
# Stueckeln
for off in range(0, len(pcm_bytes), PCM_CHUNK_BYTES):
slice_ = pcm_bytes[off:off + PCM_CHUNK_BYTES]
await _send(ws, "audio_pcm", {
"requestId": request_id,
"messageId": message_id,
"base64": base64.b64encode(slice_).decode("ascii"),
"format": "pcm_s16le",
"sampleRate": sr,
"channels": 1,
"voice": voice or "default",
"chunk": chunk_index,
"final": False,
})
chunk_index += 1
except Exception as e:
logger.exception("F5-TTS Synthese-Fehler (Satz %d)", i)
await _send(ws, "xtts_response", {
"requestId": request_id,
"error": str(e)[:200],
})
return
# Final-Marker
await _send(ws, "audio_pcm", {
"requestId": request_id,
"messageId": message_id,
"base64": "",
"format": "pcm_s16le",
"sampleRate": pcm_sr,
"channels": 1,
"voice": voice or "default",
"chunk": chunk_index,
"final": True,
})
logger.info("TTS komplett: %d Chunks, %.2fs render (voice=%s, text=%d chars)",
chunk_index, time.time() - t0, voice or "default", len(text))
def _fade_in_pcm16(pcm: bytes, sr: int, fade_ms: int) -> bytes:
"""Linear Fade-In auf erste fade_ms — maskiert Warmup-Glitches."""
arr = np.frombuffer(pcm, dtype=np.int16).copy()
fade_samples = min(int((fade_ms / 1000.0) * sr), len(arr))
if fade_samples <= 0:
return pcm
ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
arr[:fade_samples] = (arr[:fade_samples].astype(np.float32) * ramp).astype(np.int16)
return arr.tobytes()
# ── Voice Management Handlers ───────────────────────────────
async def handle_voice_upload(ws, payload: dict) -> None:
name = (payload.get("name") or "").strip()
samples = payload.get("samples") or []
if not name or not samples:
logger.warning("voice_upload: ungueltig (name=%r, samples=%d)", name, len(samples))
return
logger.info("Voice-Upload: '%s' (%d Samples)", name, len(samples))
try:
VOICES_DIR.mkdir(parents=True, exist_ok=True)
safe = sanitize_voice_name(name)
wav_path = VOICES_DIR / f"{safe}.wav"
txt_path = VOICES_DIR / f"{safe}.txt"
# Samples zusammenfuegen
buffers = [base64.b64decode(s.get("base64", "")) for s in samples]
with open(wav_path, "wb") as f:
for b in buffers:
f.write(b)
size_kb = wav_path.stat().st_size / 1024
logger.info("Voice WAV gespeichert: %s (%.0fKB)", wav_path, size_kb)
# Auf 24kHz mono normalisieren (falls App in anderem Format liefert)
ensure_24k_mono_wav(wav_path)
# Transkription ueber whisper-bridge anfragen
logger.info("Transkribiere '%s' via whisper-bridge...", name)
text = await request_transcription(ws, wav_path, language="de")
if not text:
logger.warning("Transkription fehlgeschlagen — speichere Platzhalter-Text")
text = "Das ist ein Referenz Audio."
txt_path.write_text(text.strip(), encoding="utf-8")
logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])
await _send(ws, "xtts_voice_saved", {
"name": name, "size": int(size_kb * 1024), "refText": text.strip(),
})
# Liste aktualisieren
await handle_list_voices(ws)
except Exception as e:
logger.exception("voice_upload Fehler")
await _send(ws, "xtts_voice_saved", {"name": name, "error": str(e)[:200]})
async def handle_list_voices(ws) -> None:
try:
voices = []
if VOICES_DIR.exists():
for wav in sorted(VOICES_DIR.glob("*.wav")):
txt = wav.with_suffix(".txt")
voices.append({
"name": wav.stem,
"file": wav.name,
"size": wav.stat().st_size,
"hasRefText": txt.exists(),
})
logger.info("Stimmen-Liste: %d", len(voices))
await _send(ws, "xtts_voices_list", {"voices": voices})
except Exception:
logger.exception("handle_list_voices Fehler")
async def handle_delete_voice(ws, payload: dict) -> None:
name = (payload.get("name") or "").strip()
if not name:
return
try:
wav, txt = voice_paths(name)
for p in (wav, txt):
if p.exists():
p.unlink()
logger.info("Voice geloescht: %s", p)
await handle_list_voices(ws)
except Exception:
logger.exception("handle_delete_voice Fehler")
# Letzte diagnostisch-gesetzte Voice (verhindert Endlos-Preload bei jedem config)
_last_diag_voice = ""
async def handle_voice_preload(ws, payload: dict, runner: F5Runner) -> None:
voice = (payload.get("voice") or "").strip()
request_id = payload.get("requestId", "")
logger.info("Voice-Preload angefordert: '%s'", voice or "default")
try:
ref_wav, ref_txt = voice_paths(voice) if voice else (None, None)
if voice and (not ref_wav or not ref_wav.exists()):
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": "voice-file-not-found"})
return
# Ref-Text sicherstellen (falls nur WAV da ist)
if voice and ref_txt and not ref_txt.exists():
text = await request_transcription(ws, ref_wav, language="de")
if text:
ref_txt.write_text(text.strip(), encoding="utf-8")
logger.info("Referenz-Text beim Preload nachgezogen")
# Dummy-Render zum Warmup
t0 = time.time()
await _do_tts(ws, runner, "ja.", voice, f"preload-{request_id}", "", "de")
ms = int((time.time() - t0) * 1000)
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "loadMs": ms})
except Exception as e:
logger.exception("Voice-Preload Fehler")
await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": str(e)[:200]})
# ── Haupt-Loop ──────────────────────────────────────────────
async def run_loop(runner: F5Runner) -> None:
# Preload im Hintergrund starten damit der Startup nicht blockiert
asyncio.create_task(runner.ensure_loaded())
use_tls = RVS_TLS
retry_s = 2
tls_fallback_tried = False
global _last_diag_voice
while True:
scheme = "wss" if use_tls else "ws"
url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
try:
logger.info("Verbinde zu RVS: %s", masked)
async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
logger.info("RVS verbunden")
retry_s = 2
tls_fallback_tried = False
# TTS-Worker fuer diese Verbindung starten
worker = asyncio.create_task(_tts_worker(ws, runner))
try:
async for raw in ws:
try:
msg = json.loads(raw)
except Exception:
continue
mtype = msg.get("type", "")
payload = msg.get("payload", {}) or {}
if mtype == "xtts_request":
await _tts_queue.put((
payload.get("text", ""),
payload.get("voice", "") or "",
payload.get("requestId", ""),
payload.get("messageId", ""),
payload.get("language", "de"),
))
elif mtype == "voice_upload":
asyncio.create_task(handle_voice_upload(ws, payload))
elif mtype == "xtts_list_voices":
asyncio.create_task(handle_list_voices(ws))
elif mtype == "xtts_delete_voice":
asyncio.create_task(handle_delete_voice(ws, payload))
elif mtype == "voice_preload":
asyncio.create_task(handle_voice_preload(ws, payload, runner))
elif mtype == "stt_response":
# Antwort auf unseren internen Transkriptions-Request
req_id = payload.get("requestId", "")
fut = _pending_stt.get(req_id)
if fut and not fut.done():
if payload.get("error"):
fut.set_result(None)
else:
fut.set_result(payload.get("text") or "")
elif mtype == "config":
v = (payload.get("xttsVoice") or "").strip()
if v and v != _last_diag_voice:
_last_diag_voice = v
asyncio.create_task(handle_voice_preload(
ws, {"voice": v, "source": "diagnostic"}, runner,
))
elif not v:
_last_diag_voice = ""
finally:
worker.cancel()
try:
await worker
except asyncio.CancelledError:
pass
except Exception as e:
logger.warning("Verbindung verloren: %s", e)
if use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
logger.info("TLS fehlgeschlagen — Fallback auf ws://")
use_tls = False
tls_fallback_tried = True
continue
await asyncio.sleep(min(retry_s, 30))
retry_s = min(retry_s * 2, 30)
async def main() -> None:
if not RVS_HOST:
logger.error("RVS_HOST nicht gesetzt — Abbruch")
sys.exit(1)
VOICES_DIR.mkdir(parents=True, exist_ok=True)
runner = F5Runner()
await run_loop(runner)
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
sys.exit(0)

View File

@ -0,0 +1,5 @@
f5-tts>=1.0.0
websockets>=12.0
numpy>=1.24
soundfile>=0.12
requests>=2.31

View File

@ -1,8 +0,0 @@
{
"name": "aria-xtts-bridge",
"version": "1.0.0",
"private": true,
"dependencies": {
"ws": "^8.16.0"
}
}