feat(phase2): XTTS durch F5-TTS ersetzt — Voice Cloning auf der Gamebox
Neuer aria-f5tts-bridge Container:
- Python-Service, laedt F5TTS_v1_Base beim Start
- Empfaengt xtts_request via RVS, synthetisiert mit Voice-Cloning,
streamt PCM-Chunks (audio_pcm, 16-bit s16le) wie zuvor die XTTS-Bridge
- Teilt lange Texte an Satzgrenzen, streamt satzweise
- Fade-In auf erstem Chunk, Queue gegen parallel-Render
Voice-Management:
- Speicherort weiterhin /voices/, aber jetzt als Paar
{name}.wav + {name}.txt (F5-TTS braucht Referenz-Transkription)
- voice_upload: WAV speichern, intern stt_request an whisper-bridge
senden, Transkription als .txt ablegen → user muss nichts eintippen
- On-the-fly Transkribierung: wenn eine WAV ohne .txt liegt, wird
bei erstem Render/Preload nachgezogen
- Bestehende RVS-Messages (voice_upload, xtts_list_voices usw.)
bleiben unveraendert → keine App/Diagnostic-Aenderung noetig
Gaming-PC docker-compose:
- xtts + xtts-bridge Services entfernt
- f5tts-bridge + whisper-bridge bleiben/kommen rein
- Volume xtts-models → f5tts-models
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e170991222
commit
576ae925dd
|
|
@ -1,5 +0,0 @@
|
||||||
FROM node:22-alpine
|
|
||||||
WORKDIR /app
|
|
||||||
COPY bridge.js package.json ./
|
|
||||||
RUN npm install --production
|
|
||||||
CMD ["node", "bridge.js"]
|
|
||||||
596
xtts/bridge.js
596
xtts/bridge.js
|
|
@ -1,596 +0,0 @@
|
||||||
/**
|
|
||||||
* ARIA XTTS Bridge — Verbindet XTTS v2 Server mit dem RVS
|
|
||||||
*
|
|
||||||
* Empfaengt tts_request ueber RVS → rendert Audio via XTTS API → sendet zurueck
|
|
||||||
* Empfaengt voice_upload → speichert Voice-Sample fuer Cloning
|
|
||||||
* Empfaengt xtts_list_voices → listet verfuegbare Stimmen
|
|
||||||
*/
|
|
||||||
|
|
||||||
const WebSocket = require("ws");
|
|
||||||
const http = require("http");
|
|
||||||
const https = require("https");
|
|
||||||
const fs = require("fs");
|
|
||||||
const path = require("path");
|
|
||||||
|
|
||||||
// Base URL of the XTTS API server. The default uses the container-internal
// port 8020: the compose file publishes it as "8000:8020" for the host only,
// and the bridge reaches the server over the compose network as xtts:8020.
const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8020";

// RVS connection settings (all taken from the environment).
const RVS_HOST = process.env.RVS_HOST || "";
const RVS_PORT = process.env.RVS_PORT || "443";
// Kept as the strings "true"/"false"; compared against "true" where used.
const RVS_TLS = process.env.RVS_TLS || "true";
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
const RVS_TOKEN = process.env.RVS_TOKEN || "";

// Directory holding the uploaded voice samples ({name}.wav).
const VOICES_DIR = "/voices";
|
|
||||||
|
|
||||||
/**
 * Prints one log line to stdout, prefixed with the current ISO-8601 timestamp.
 *
 * @param {string} msg - Text to log.
 */
function log(msg) {
  const stamp = new Date().toISOString();
  console.log(`[${stamp}] ${msg}`);
}
|
|
||||||
|
|
||||||
// ── RVS connection ──────────────────────────────────

// Currently open RVS socket (null while disconnected).
let rvsWs = null;
// Reconnect delay in seconds; doubled after every close, capped at 30.
let retryDelay = 2;

/**
 * Opens the WebSocket connection to the RVS and wires up all handlers.
 *
 * - Exits the process when RVS_HOST or RVS_TOKEN is missing.
 * - On "open": publishes the socket in `rvsWs`, resets the backoff and starts
 *   a 25 s keepalive (ping + heartbeat message).
 * - On "message": dispatches by `msg.type` to the matching handler.
 * - On "close": stops the keepalive and schedules a reconnect with backoff.
 * - On "error": if TLS was used, the fallback is enabled and the connection
 *   never opened, retries immediately over plain ws://.
 *
 * @param {boolean} [forcePlain] - When truthy, connect via ws:// even if
 *   RVS_TLS is "true" (used by the TLS fallback).
 */
function connectRVS(forcePlain) {
  if (!RVS_HOST || !RVS_TOKEN) {
    log("RVS nicht konfiguriert — beende");
    process.exit(1);
  }

  const useTls = RVS_TLS === "true" && !forcePlain;
  const proto = useTls ? "wss" : "ws";
  const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;

  // The token is deliberately not part of the logged URL.
  log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);

  const ws = new WebSocket(url);

  // Per-connection state. The keepalive timer must be cleared when this
  // socket goes away, otherwise every reconnect leaks another interval.
  let keepaliveTimer = null;
  let opened = false;

  const stopKeepalive = () => {
    if (keepaliveTimer !== null) {
      clearInterval(keepaliveTimer);
      keepaliveTimer = null;
    }
  };

  ws.on("open", () => {
    log("RVS verbunden — warte auf TTS-Requests");
    opened = true;
    rvsWs = ws;
    retryDelay = 2;

    // Keepalive
    stopKeepalive();
    keepaliveTimer = setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        ws.ping();
        ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
      }
    }, 25000);
  });

  ws.on("message", async (raw) => {
    try {
      const msg = JSON.parse(raw.toString());

      switch (msg.type) {
        case "xtts_request":
          await handleTTSRequest(msg.payload);
          break;
        case "voice_upload":
          await handleVoiceUpload(msg.payload);
          break;
        case "xtts_list_voices":
          await handleListVoices();
          break;
        case "xtts_delete_voice":
          await handleDeleteVoice(msg.payload);
          break;
        case "voice_preload":
          await handleVoicePreload(msg.payload);
          break;
        case "config": {
          // Diagnostic switched the global voice → preload it so the next
          // render starts without load latency and all clients see
          // "voice_ready". Unchanged voices do not trigger another preload.
          const v = msg.payload && msg.payload.xttsVoice;
          if (v && v !== lastDiagnosticVoice) {
            lastDiagnosticVoice = v;
            await handleVoicePreload({ voice: v, source: "diagnostic" });
          } else if (!v) {
            lastDiagnosticVoice = "";
          }
          break;
        }
        default:
          // Other message types are not meant for this bridge.
          break;
      }
    } catch (err) {
      log(`Fehler: ${err.message}`);
    }
  });

  ws.on("close", () => {
    log("RVS Verbindung geschlossen");
    stopKeepalive();
    rvsWs = null;
    setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
    retryDelay = Math.min(retryDelay * 2, 30);
  });

  ws.on("error", (err) => {
    log(`RVS Fehler: ${err.message}`);
    // Only treat errors before "open" as a TLS failure. An error on an
    // established connection is followed by "close", which reconnects with
    // the regular backoff instead of silently downgrading to plain ws://.
    if (!opened && useTls && RVS_TLS_FALLBACK === "true") {
      log("TLS fehlgeschlagen — Fallback auf ws://");
      stopKeepalive();
      ws.removeAllListeners();
      // Keep a no-op listener: a late "error" on the discarded socket would
      // otherwise be an unhandled EventEmitter error.
      ws.on("error", () => {});
      try {
        ws.close();
      } catch (_) {
        // Best effort — the socket is being thrown away anyway.
      }
      connectRVS(true);
    }
  });
}
|
|
||||||
|
|
||||||
// ── TTS Request Handler ─────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Applies a linear fade-in to the start of a base64-encoded PCM chunk (s16le).
 * Used to mask XTTS warm-up glitches at the beginning of a render.
 *
 * @param {string} base64Pcm - Base64 string of 16-bit little-endian PCM data.
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} channels - Number of interleaved channels.
 * @param {number} fadeMs - Fade duration in milliseconds.
 * @returns {string} Base64 string of the faded PCM data (same byte length).
 */
function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) {
  const buf = Buffer.from(base64Pcm, "base64");
  // Count whole 16-bit samples only. With an odd byte count the unfloored
  // value would be fractional and the loop below would read past the buffer.
  const totalSamples = Math.floor(buf.length / 2);
  const requestedSamples = Math.floor((fadeMs / 1000) * sampleRate) * channels;
  const fadeSamples = Math.min(requestedSamples, totalSamples);

  for (let i = 0; i < fadeSamples; i++) {
    const offset = i * 2;
    const gain = i / fadeSamples; // ramps 0 → just below 1
    buf.writeInt16LE(Math.round(buf.readInt16LE(offset) * gain), offset);
  }
  return buf.toString("base64");
}
|
|
||||||
|
|
||||||
// ── TTS queue ───────────────────────────────────────
// Requests are processed one after another so that streams never overlap.
// Without the queue, parallel requests would stream in parallel and the app
// would receive interleaved PCM chunks from two renders.
let ttsQueue = Promise.resolve();

// Remembers the voice last selected in Diagnostic so that a config broadcast
// without an actual change does not trigger another preload.
let lastDiagnosticVoice = "";

/**
 * Appends a TTS request to the sequential render queue.
 *
 * @param {object} payload - Request payload, forwarded to _runTTSRequest.
 * @returns {Promise<void>} The queue tail; settles once this request is done.
 *   Failures are logged and swallowed so the queue keeps running.
 */
function handleTTSRequest(payload) {
  const runThisRequest = () => _runTTSRequest(payload);
  const reportFailure = (err) => {
    log(`TTS-Queue Fehler: ${err.message}`);
  };

  ttsQueue = ttsQueue.then(runThisRequest).catch(reportFailure);
  return ttsQueue;
}
|
|
||||||
|
|
||||||
/**
 * Strips Markdown markup from a text so it can be spoken.
 * Fenced code blocks are removed first: if the inline-backtick rule ran
 * before them it would eat part of the fence and the code would be read out.
 *
 * @param {string} text - Raw message text.
 * @returns {string} Plain text for the TTS engine.
 */
function _cleanTextForTTS(text) {
  return text
    .replace(/```[\s\S]*?```/g, "")
    .replace(/\*\*([^*]+)\*\*/g, "$1")
    .replace(/\*([^*]+)\*/g, "$1")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .replace(/#{1,6}\s*/g, "")
    .replace(/>\s*/g, "")
    .replace(/[-*]\s+/g, "")
    .replace(/\n{2,}/g, ". ")
    .replace(/\n/g, ", ")
    .replace(/\s{2,}/g, " ")
    .replace(/["""„]/g, "")
    .replace(/\(\)/g, "")
    .trim();
}

/**
 * Renders one TTS request and streams the audio to the RVS as "audio_pcm"
 * messages, followed by an empty message with `final: true`.
 * On failure an "xtts_response" message carrying the error is sent instead.
 *
 * @param {object} payload - Request payload.
 * @param {string} payload.text - Text to speak; the request is ignored if empty.
 * @param {string} [payload.voice] - Voice name ({voice}.wav in VOICES_DIR).
 * @param {string} [payload.requestId] - Echoed back in every message.
 * @param {string} [payload.language] - Language code, defaults to "de".
 * @param {string} [payload.messageId] - Echoed back in every message.
 * @returns {Promise<void>}
 */
async function _runTTSRequest(payload) {
  const { text, voice, requestId, language, messageId } = payload || {};
  if (!text) return;

  // Markdown cleanup (safety net; the upstream bridge is said to clean as well).
  const cleanText = _cleanTextForTTS(text);

  log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);

  try {
    // Per the original note, the XTTS server in local mode expects speaker_wav
    // as a bare name (no path, no .wav) and resolves it itself. The file check
    // here only decides between the custom voice and the default.
    const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
    const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath);
    const speakerName = hasCustomVoice ? voice : "";
    if (voice && !hasCustomVoice) {
      log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`);
    } else if (hasCustomVoice) {
      log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`);
    }

    let chunkIndex = 0;
    let pcmMeta = null;
    let firstChunkSeen = false;

    // Sends one audio_pcm frame; every frame consumes one chunk index.
    const sendPcm = (base64, meta, final) => {
      sendToRVS({
        type: "audio_pcm",
        payload: {
          requestId: requestId || "",
          messageId: messageId || "",
          base64,
          format: "pcm_s16le",
          sampleRate: meta.sampleRate,
          channels: meta.channels,
          voice: voice || "default",
          chunk: chunkIndex++,
          final,
        },
        timestamp: Date.now(),
      });
    };

    const onChunk = (pcmBase64, meta) => {
      if (!pcmMeta) pcmMeta = meta;
      let outBase64 = pcmBase64;
      // Fade-in on the first chunk masks warm-up artefacts at render start.
      if (!firstChunkSeen && pcmBase64) {
        firstChunkSeen = true;
        outBase64 = applyFadeIn(pcmBase64, meta.sampleRate, meta.channels, 120);
      }
      sendPcm(outBase64, meta, false);
    };

    // Prefer /tts_stream for real streaming; fall back to /tts_to_audio/
    // when the streaming endpoint is rejected.
    try {
      await streamXTTSAsPCM(cleanText, language || "de", speakerName, onChunk);
    } catch (streamErr) {
      // If audio was already delivered, re-rendering from the start would
      // make the client play the beginning twice — report the error instead.
      if (chunkIndex > 0) throw streamErr;
      log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`);
      await streamXTTSBatch(cleanText, language || "de", speakerName, onChunk);
    }

    // Final marker so the app knows the render is complete. It is only sent
    // when at least one chunk was produced (sample rate/channels are known).
    if (pcmMeta) {
      sendPcm("", pcmMeta, true);
    }

    log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
  } catch (err) {
    log(`TTS Fehler: ${err.message}`);
    sendToRVS({
      type: "xtts_response",
      payload: { requestId, error: err.message },
      timestamp: Date.now(),
    });
  }
}
|
|
||||||
|
|
||||||
/**
 * Calls the XTTS streaming endpoint (GET /tts_stream with query parameters)
 * and forwards the audio as fixed-size PCM frames.
 *
 * The first 44 bytes of the response are treated as a WAV header, from which
 * channel count and sample rate are read. The rest is cut into 8192-byte
 * frames; the remainder is flushed when the response ends.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} language - Language code ("de" if falsy).
 * @param {string} speakerWav - Speaker name; empty string for the default voice.
 * @param {(base64: string, meta: {sampleRate: number, channels: number}) => void} onPcmChunk
 *   Called for every PCM frame.
 * @returns {Promise<void>} Resolves when the response ended; rejects on a
 *   non-200 status, a request/response error or the 60 s timeout.
 */
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
  return new Promise((resolve, reject) => {
    // speaker_wav must always be present as a query key (even empty), per the
    // original note the server answers HTTP 422 otherwise.
    // stream_chunk_size=250: larger chunks mean fewer chunk boundaries and,
    // per the original note, fewer glitches; higher latency is accepted.
    const query = new URLSearchParams({
      text,
      language: language || "de",
      speaker_wav: speakerWav || "",
      stream_chunk_size: "250",
    });

    const { hostname, port } = new URL(XTTS_API_URL);
    const requestOptions = {
      hostname,
      port: port || 80,
      path: `/tts_stream?${query.toString()}`,
      method: "GET",
      timeout: 60000,
    };

    log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`);

    const req = http.request(requestOptions, (res) => {
      if (res.statusCode !== 200) {
        // Collect the error body for the log and the rejection message.
        const parts = [];
        res.on("data", (d) => {
          parts.push(d.toString());
        });
        res.on("end", () => {
          const body = parts.join("");
          log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`);
          reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`));
        });
        return;
      }
      log(`TTS stream verbunden, empfange PCM...`);

      const HEADER_BYTES = 44;
      const PCM_CHUNK_BYTES = 8192; // ~170ms at 24kHz s16 mono; even, so samples are never split
      let sampleRate = 24000;
      let channels = 1;
      let headerDone = false;
      let headerAcc = Buffer.alloc(0);
      let carry = Buffer.alloc(0); // bytes not yet filling a full frame

      res.on("data", (chunk) => {
        let pcm = chunk;

        if (!headerDone) {
          // Accumulate until the whole WAV header has arrived.
          headerAcc = Buffer.concat([headerAcc, chunk]);
          if (headerAcc.length < HEADER_BYTES) return;

          const wavHeader = headerAcc.slice(0, HEADER_BYTES);
          try {
            channels = wavHeader.readUInt16LE(22);
            sampleRate = wavHeader.readUInt32LE(24);
          } catch (_) {}
          headerDone = true;
          pcm = headerAcc.slice(HEADER_BYTES);
        }

        let pending = Buffer.concat([carry, pcm]);
        for (; pending.length >= PCM_CHUNK_BYTES; pending = pending.slice(PCM_CHUNK_BYTES)) {
          const frame = pending.slice(0, PCM_CHUNK_BYTES);
          onPcmChunk(frame.toString("base64"), { sampleRate, channels });
        }
        carry = pending;
      });

      res.on("end", () => {
        // Flush whatever is left.
        if (carry.length > 0) {
          onPcmChunk(carry.toString("base64"), { sampleRate, channels });
        }
        resolve();
      });

      res.on("error", reject);
    });

    req.on("error", reject);
    req.on("timeout", () => {
      req.destroy();
      reject(new Error("XTTS API Timeout (60s)"));
    });
    req.end();
  });
}
|
|
||||||
|
|
||||||
/**
 * Fallback renderer: POSTs the request as JSON to /tts_to_audio/ and cuts
 * the WAV response into PCM frames, with the same header parsing and framing
 * as streamXTTSAsPCM. Used when /tts_stream does not work.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} language - Language code ("de" if falsy).
 * @param {string} speakerWav - Speaker name; empty string for the default voice.
 * @param {(base64: string, meta: {sampleRate: number, channels: number}) => void} onPcmChunk
 *   Called for every PCM frame.
 * @returns {Promise<void>} Resolves when the response ended; rejects on a
 *   non-200 status, a request/response error or the 60 s timeout.
 */
function streamXTTSBatch(text, language, speakerWav, onPcmChunk) {
  return new Promise((resolve, reject) => {
    const body = JSON.stringify({
      text,
      language: language || "de",
      speaker_wav: speakerWav || "",
    });

    const { hostname, port } = new URL(XTTS_API_URL);
    const requestOptions = {
      hostname,
      port: port || 80,
      path: "/tts_to_audio/",
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(body),
      },
      timeout: 60000,
    };

    const req = http.request(requestOptions, (res) => {
      if (res.statusCode !== 200) {
        // Collect the error body for the rejection message.
        const parts = [];
        res.on("data", (d) => {
          parts.push(d.toString());
        });
        res.on("end", () => {
          const rb = parts.join("");
          reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`));
        });
        return;
      }

      const HEADER_BYTES = 44;
      const PCM_CHUNK_BYTES = 8192;
      let sampleRate = 24000;
      let channels = 1;
      let headerDone = false;
      let headerAcc = Buffer.alloc(0);
      let carry = Buffer.alloc(0); // bytes not yet filling a full frame

      res.on("data", (chunk) => {
        let pcm = chunk;

        if (!headerDone) {
          // Accumulate until the whole WAV header has arrived.
          headerAcc = Buffer.concat([headerAcc, chunk]);
          if (headerAcc.length < HEADER_BYTES) return;

          const wavHeader = headerAcc.slice(0, HEADER_BYTES);
          try {
            channels = wavHeader.readUInt16LE(22);
            sampleRate = wavHeader.readUInt32LE(24);
          } catch (_) {}
          headerDone = true;
          pcm = headerAcc.slice(HEADER_BYTES);
        }

        let pending = Buffer.concat([carry, pcm]);
        for (; pending.length >= PCM_CHUNK_BYTES; pending = pending.slice(PCM_CHUNK_BYTES)) {
          const frame = pending.slice(0, PCM_CHUNK_BYTES);
          onPcmChunk(frame.toString("base64"), { sampleRate, channels });
        }
        carry = pending;
      });

      res.on("end", () => {
        if (carry.length > 0) {
          onPcmChunk(carry.toString("base64"), { sampleRate, channels });
        }
        resolve();
      });

      res.on("error", reject);
    });

    req.on("error", reject);
    req.on("timeout", () => {
      req.destroy();
      reject(new Error("XTTS Batch Timeout (60s)"));
    });
    req.write(body);
    req.end();
  });
}
|
|
||||||
|
|
||||||
// ── Voice Upload Handler ────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Stores an uploaded voice sample as {name}.wav in VOICES_DIR and announces
 * it with an "xtts_voice_saved" message.
 *
 * The file name is derived from `name` with every character outside
 * [a-zA-Z0-9_-] replaced by "_"; the message reports the original name.
 *
 * @param {object} payload - Upload payload.
 * @param {string} payload.name - Voice name.
 * @param {Array<{base64: string}>} payload.samples - Base64-encoded samples.
 * @returns {Promise<void>} Invalid payloads and I/O errors are logged only.
 */
async function handleVoiceUpload(payload) {
  // Same guards as handleDeleteVoice: tolerate a missing payload and
  // reject non-string names up front.
  const { name, samples } = payload || {};
  if (typeof name !== "string" || !name || !Array.isArray(samples) || samples.length === 0) {
    log("Voice Upload: Ungueltige Daten");
    return;
  }

  log(`Voice Upload: "${name}" (${samples.length} Samples)`);

  try {
    // Join all samples byte-wise.
    // NOTE(review): if each sample is a complete WAV file, the result carries
    // several headers — confirm what the app actually sends.
    const buffers = samples.map((s) => Buffer.from(s.base64, "base64"));
    const combined = Buffer.concat(buffers);

    // Write the result as {safeName}.wav.
    fs.mkdirSync(VOICES_DIR, { recursive: true });
    const filePath = path.join(VOICES_DIR, `${name.replace(/[^a-zA-Z0-9_-]/g, "_")}.wav`);
    fs.writeFileSync(filePath, combined);

    log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);

    sendToRVS({
      type: "xtts_voice_saved",
      payload: { name, size: combined.length, path: filePath },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Voice Upload Fehler: ${err.message}`);
  }
}
|
|
||||||
|
|
||||||
// ── Voice Delete Handler ────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Deletes the WAV file of a voice and then broadcasts the updated voice list.
 * The list is also sent when the file did not exist.
 *
 * @param {object} payload - Delete payload.
 * @param {string} payload.name - Voice name; sanitized like on upload.
 * @returns {Promise<void>} Errors are logged, never thrown.
 */
async function handleDeleteVoice(payload) {
  const { name } = payload || {};
  if (typeof name !== "string" || !name) {
    log("Voice Delete: ungueltiger Name");
    return;
  }

  const sanitized = name.replace(/[^a-zA-Z0-9_-]/g, "_");
  const target = path.join(VOICES_DIR, `${sanitized}.wav`);

  try {
    if (!fs.existsSync(target)) {
      log(`Voice Delete: Datei existiert nicht (${target})`);
    } else {
      fs.unlinkSync(target);
      log(`Voice geloescht: ${target}`);
    }
    // Send the refreshed list to all clients.
    await handleListVoices();
  } catch (err) {
    log(`Voice Delete Fehler: ${err.message}`);
  }
}
|
|
||||||
|
|
||||||
// ── Voice Preload + List Handler ────────────────────
|
|
||||||
|
|
||||||
/**
 * Preloads a voice by rendering a short dummy text ("ja.") and discarding the
 * audio, so the next real request for that voice can start without load
 * latency. Always answers with a "voice_ready" message: with `loadMs` on
 * success, with `error` on failure or when the voice file is missing.
 *
 * @param {object} [payload] - Preload payload.
 * @param {string} [payload.voice] - Voice name; empty means default voice.
 * @param {string} [payload.source] - Origin of the request (logging only).
 * @param {string} [payload.requestId] - Echoed back in the answer.
 * @returns {Promise<void>}
 */
async function handleVoicePreload(payload) {
  const voice = (payload && payload.voice) || "";
  const source = (payload && payload.source) || "unknown";
  const requestId = (payload && payload.requestId) || "";
  log(`Voice-Preload angefordert: "${voice}" (source=${source})`);

  // Sends the voice_ready answer; `extra` carries loadMs or error.
  const announce = (extra) => {
    sendToRVS({
      type: "voice_ready",
      payload: { voice, requestId, ...extra },
      timestamp: Date.now(),
    });
  };

  try {
    let speakerName = "";
    if (voice) {
      const voiceFilePath = path.join(VOICES_DIR, `${voice}.wav`);
      if (!fs.existsSync(voiceFilePath)) {
        announce({ error: "voice-file-not-found" });
        log(`Preload abgebrochen: ${voiceFilePath} existiert nicht`);
        return;
      }
      speakerName = voice;
    }

    // The dummy render goes through the TTS queue so it cannot overlap with a
    // real request. The queue itself never rejects; the outcome is reported
    // through this promise.
    const startedAt = Date.now();
    await new Promise((resolve, reject) => {
      const warmUp = () =>
        streamXTTSAsPCM("ja.", "de", speakerName, () => {}).then(resolve, reject);
      ttsQueue = ttsQueue.then(warmUp).catch(reject);
    });
    const ms = Date.now() - startedAt;
    log(`Voice "${voice || "default"}" geladen in ${ms}ms`);

    announce({ loadMs: ms });
  } catch (err) {
    log(`Voice-Preload Fehler: ${err.message}`);
    announce({ error: err.message.slice(0, 200) });
  }
}
|
|
||||||
|
|
||||||
/**
 * Sends the list of available voices (all *.wav files in VOICES_DIR) to the
 * RVS as an "xtts_voices_list" message. A missing directory yields an empty
 * list.
 *
 * @returns {Promise<void>} Errors are logged, never thrown.
 */
async function handleListVoices() {
  try {
    let wavFiles = [];
    if (fs.existsSync(VOICES_DIR)) {
      wavFiles = fs.readdirSync(VOICES_DIR).filter((entry) => entry.endsWith(".wav"));
    }

    const voices = [];
    for (const file of wavFiles) {
      const { size } = fs.statSync(path.join(VOICES_DIR, file));
      voices.push({ name: path.basename(file, ".wav"), file, size });
    }

    log(`Stimmen: ${voices.length} verfuegbar`);

    sendToRVS({
      type: "xtts_voices_list",
      payload: { voices },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Stimmen-Liste Fehler: ${err.message}`);
  }
}
|
|
||||||
|
|
||||||
// ── RVS senden ──────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Sends a message as JSON over the RVS socket.
 * The message is dropped silently when no open connection exists.
 *
 * @param {object} msg - Message object; serialized with JSON.stringify.
 */
function sendToRVS(msg) {
  const connected = Boolean(rvsWs) && rvsWs.readyState === WebSocket.OPEN;
  if (!connected) return;
  rvsWs.send(JSON.stringify(msg));
}
|
|
||||||
|
|
||||||
// ── Start ───────────────────────────────────────────
|
|
||||||
|
|
||||||
log("ARIA XTTS Bridge startet...");
|
|
||||||
log(`XTTS API: ${XTTS_API_URL}`);
|
|
||||||
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);
|
|
||||||
|
|
||||||
/**
 * Waits until the XTTS API answers on /docs, then invokes `callback`.
 * Any HTTP response counts as "reachable". A failed or hanging probe is
 * retried after 10 s; once the attempts are used up, `callback` is invoked
 * anyway.
 *
 * @param {() => void} callback - Invoked exactly once per successful probe or
 *   when the attempts are exhausted.
 * @param {number} attempts - Remaining number of probes.
 */
function waitForXTTS(callback, attempts) {
  if (attempts <= 0) {
    log("XTTS API nicht erreichbar — starte trotzdem");
    callback();
    return;
  }

  // Guards against handling the same probe twice (e.g. an error after a
  // response was already received).
  let settled = false;

  const req = http.get(`${XTTS_API_URL}/docs`, (res) => {
    // The body is irrelevant, but it has to be consumed so the socket is freed.
    res.resume();
    if (settled) return;
    settled = true;
    log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
    callback();
  });

  req.on("error", () => {
    if (settled) return;
    settled = true;
    log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
    // 10 s between probes: loading the model takes a while.
    setTimeout(() => waitForXTTS(callback, attempts - 1), 10000);
  });

  // Without a timeout a connection that never answers would stall startup
  // forever; destroying the request routes it into the error/retry path.
  req.setTimeout(10000, () => {
    req.destroy(new Error("XTTS probe timeout"));
  });
}
|
|
||||||
|
|
||||||
waitForXTTS(() => connectRVS(), 30); // Max 5min warten
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
# ════════════════════════════════════════════════
|
# ════════════════════════════════════════════════
|
||||||
# ARIA XTTS v2 — GPU TTS Server
|
# ARIA Gamebox Stack — GPU F5-TTS + Whisper STT
|
||||||
# Laeuft auf dem Gaming-PC (RTX 3060)
|
# Laeuft auf dem Gaming-PC (RTX 3060)
|
||||||
# Verbindet sich zum RVS fuer TTS-Requests
|
# Verbindet sich zum RVS fuer TTS/STT-Requests
|
||||||
# ════════════════════════════════════════════════
|
# ════════════════════════════════════════════════
|
||||||
#
|
#
|
||||||
# Voraussetzungen:
|
# Voraussetzungen:
|
||||||
|
|
@ -10,15 +10,18 @@
|
||||||
# - .env mit RVS-Verbindungsdaten
|
# - .env mit RVS-Verbindungsdaten
|
||||||
#
|
#
|
||||||
# Start: docker compose up -d
|
# Start: docker compose up -d
|
||||||
# Test: curl http://localhost:8000/docs
|
|
||||||
# ════════════════════════════════════════════════
|
# ════════════════════════════════════════════════
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
# ─── XTTS v2 API Server (GPU) ─────────────────
|
# ─── F5-TTS Bridge (GPU) ──────────────────────
|
||||||
xtts:
|
# Ersetzt den frueheren XTTS-Stack. Empfaengt xtts_request via RVS,
|
||||||
image: daswer123/xtts-api-server:latest
|
# rendert via F5-TTS mit Voice-Cloning, streamt PCM an die App.
|
||||||
container_name: aria-xtts
|
# Voice-Upload: speichert WAV und laesst whisper-bridge den Referenz-
|
||||||
|
# text transkribieren — der User muss nichts eintippen.
|
||||||
|
f5tts-bridge:
|
||||||
|
build: ./f5tts
|
||||||
|
container_name: aria-f5tts-bridge
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
|
|
@ -26,45 +29,29 @@ services:
|
||||||
- driver: nvidia
|
- driver: nvidia
|
||||||
count: 1
|
count: 1
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
ports:
|
|
||||||
- "8000:8020"
|
|
||||||
volumes:
|
volumes:
|
||||||
- xtts-models:/app/xtts_models # Model-Cache (~2GB)
|
- ./voices:/voices # WAV + TXT Referenz
|
||||||
- ./voices:/voices # Custom Voice Samples
|
- f5tts-models:/root/.cache/huggingface # Model-Cache persistieren
|
||||||
environment:
|
environment:
|
||||||
- COQUI_TOS_AGREED=1
|
|
||||||
# Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM,
|
|
||||||
# Render startet sofort, /tts_stream funktioniert.
|
|
||||||
# Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"}
|
|
||||||
- MODEL_SOURCE=local
|
|
||||||
# Speaker-Folder auf unsere gemounteten voices zeigen lassen
|
|
||||||
- EXAMPLE_FOLDER=/voices
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
# ─── XTTS Bridge (verbindet zu RVS) ───────────
|
|
||||||
xtts-bridge:
|
|
||||||
build: .
|
|
||||||
container_name: aria-xtts-bridge
|
|
||||||
depends_on:
|
|
||||||
- xtts
|
|
||||||
volumes:
|
|
||||||
- ./voices:/voices # Shared mit XTTS-Server
|
|
||||||
environment:
|
|
||||||
- XTTS_API_URL=http://xtts:8020
|
|
||||||
- RVS_HOST=${RVS_HOST}
|
- RVS_HOST=${RVS_HOST}
|
||||||
- RVS_PORT=${RVS_PORT:-443}
|
- RVS_PORT=${RVS_PORT:-443}
|
||||||
- RVS_TLS=${RVS_TLS:-true}
|
- RVS_TLS=${RVS_TLS:-true}
|
||||||
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
|
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
|
||||||
- RVS_TOKEN=${RVS_TOKEN}
|
- RVS_TOKEN=${RVS_TOKEN}
|
||||||
|
- F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base}
|
||||||
|
- F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
|
||||||
|
- VOICES_DIR=/voices
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
# ─── Whisper STT (GPU) ────────────────────────
|
# ─── Whisper STT (GPU) ────────────────────────
|
||||||
# Faster-Whisper auf der Gamebox statt auf der VM (CPU) —
|
# Faster-Whisper auf der Gamebox statt auf der VM (CPU) —
|
||||||
# deutlich schneller. Verbindet sich selbst per WebSocket an
|
# deutlich schneller. Verbindet sich selbst per WebSocket an
|
||||||
# den RVS und nimmt dort stt_request Nachrichten der aria-bridge
|
# den RVS und nimmt dort stt_request Nachrichten der aria-bridge
|
||||||
# entgegen, antwortet mit stt_response. Laedt das Modell beim
|
# entgegen, antwortet mit stt_response. Zusaetzlich nutzt die
|
||||||
# Start vor; auf Config-Broadcasts (Diagnostic → whisperModel)
|
# f5tts-bridge Whisper intern fuer die Referenz-Transkription bei
|
||||||
# wird zur Laufzeit hot-swapped.
|
# Voice-Uploads. Laedt das Modell beim Start vor; auf Config-
|
||||||
|
# Broadcasts (Diagnostic → whisperModel) wird zur Laufzeit hot-
|
||||||
|
# swapped.
|
||||||
whisper-bridge:
|
whisper-bridge:
|
||||||
build: ./whisper
|
build: ./whisper
|
||||||
container_name: aria-whisper-bridge
|
container_name: aria-whisper-bridge
|
||||||
|
|
@ -86,9 +73,9 @@ services:
|
||||||
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
||||||
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
|
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
|
||||||
volumes:
|
volumes:
|
||||||
- whisper-models:/root/.cache/huggingface # Model-Cache persistieren
|
- whisper-models:/root/.cache/huggingface
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
xtts-models:
|
f5tts-models:
|
||||||
whisper-models:
|
whisper-models:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-pip ffmpeg git \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# PyTorch CUDA-Wheels zuerst (f5-tts zieht sonst CPU-only Torch rein)
|
||||||
|
RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
|
||||||
|
--index-url https://download.pytorch.org/whl/cu121
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip3 install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
COPY bridge.py .
|
||||||
|
|
||||||
|
CMD ["python3", "bridge.py"]
|
||||||
|
|
@ -0,0 +1,580 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
ARIA F5-TTS Bridge — laeuft auf der Gamebox (RTX 3060).
|
||||||
|
|
||||||
|
Empfaengt xtts_request via RVS → F5-TTS Voice Cloning auf GPU → streamt
|
||||||
|
16-bit PCM Chunks als audio_pcm Nachrichten zurueck an die App.
|
||||||
|
|
||||||
|
Voice-Layout im VOICES_DIR:
|
||||||
|
{name}.wav — Referenz-Audio (6-10s, 24kHz mono empfohlen)
|
||||||
|
{name}.txt — Referenz-Text (UTF-8, was im WAV gesprochen wird)
|
||||||
|
|
||||||
|
Beim voice_upload senden wir intern einen stt_request an die whisper-bridge
|
||||||
|
und legen die Transkription als .txt ab — der User muss keinen Text eingeben.
|
||||||
|
|
||||||
|
Env:
|
||||||
|
RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
|
||||||
|
F5TTS_MODEL Default: F5TTS_v1_Base
|
||||||
|
F5TTS_DEVICE Default: cuda
|
||||||
|
VOICES_DIR Default: /voices
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
import websockets
|
||||||
|
|
||||||
|
# Root logging setup: INFO level, lines formatted as "HH:MM:SS [LEVEL] message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("f5tts-bridge")
# Tone down the HuggingFace + Torch download logs a bit
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# RVS connection settings, read from the environment once at import time.
RVS_HOST = os.getenv("RVS_HOST", "").strip()
# int() raises ValueError at import time if RVS_PORT is not a valid integer.
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
# Both flags are true only for the literal value "true" (case-insensitive).
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()

# Model name and device passed to the F5TTS constructor; directory that
# holds the {name}.wav / {name}.txt reference pairs.
F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base")
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")
VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))

PCM_CHUNK_BYTES = 8192  # ~170ms @ 24kHz mono s16
TARGET_SR = 24000  # F5-TTS native
|
||||||
|
|
||||||
|
# ── Lazy F5-TTS Loader ──────────────────────────────────────
|
||||||
|
|
||||||
|
_F5TTS_cls = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_f5tts_cls():
|
||||||
|
"""Lazy import damit Startup-Logs nicht durch Torch-Warnungen zumuellen."""
|
||||||
|
global _F5TTS_cls
|
||||||
|
if _F5TTS_cls is None:
|
||||||
|
from f5_tts.api import F5TTS as _cls
|
||||||
|
_F5TTS_cls = _cls
|
||||||
|
return _F5TTS_cls
|
||||||
|
|
||||||
|
|
||||||
|
class F5Runner:
    """Holds the F5-TTS model and runs its blocking calls in the default executor.

    Two asyncio locks protect the model: ``_lock`` makes sure it is loaded
    only once, ``_infer_lock`` makes sure only one inference runs at a time,
    no matter which coroutine calls ``synthesize``.
    """

    def __init__(self) -> None:
        self.model = None  # assigned by _load_blocking()
        self._lock = asyncio.Lock()
        self._infer_lock = asyncio.Lock()

    def _load_blocking(self) -> None:
        """Instantiate the model (blocking) and log how long it took."""
        cls = _get_f5tts_cls()
        logger.info("Lade F5-TTS '%s' (device=%s)...", F5TTS_MODEL, F5TTS_DEVICE)
        t0 = time.time()
        self.model = cls(model=F5TTS_MODEL, device=F5TTS_DEVICE)
        logger.info("F5-TTS geladen in %.1fs", time.time() - t0)

    async def ensure_loaded(self) -> None:
        """Load the model if it is not loaded yet; concurrent callers wait on the lock."""
        async with self._lock:
            if self.model is not None:
                return
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self._load_blocking)

    def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
        """Run one blocking inference.

        Returns:
            ``(samples, sample_rate)`` where ``samples`` is a float32 array
            with singleton dimensions squeezed away.
        """
        wav, sr, _ = self.model.infer(
            ref_file=ref_wav,
            ref_text=ref_text,
            gen_text=gen_text,
            remove_silence=True,
            seed=-1,
        )
        # Normalize whatever the model returned to a float32 ndarray.
        wav = np.asarray(wav, dtype=np.float32)
        if wav.ndim > 1:
            wav = wav.squeeze()
        return wav, int(sr)

    async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
        """Synthesize ``gen_text`` with the given reference audio and reference text.

        Loads the model on first use. Calls are serialized through
        ``_infer_lock`` so that two inferences never run in parallel in the
        executor.
        """
        await self.ensure_loaded()
        loop = asyncio.get_running_loop()
        async with self._infer_lock:
            return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Split points: whitespace that follows '.', '!' or '?', or any run of newlines.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")


def split_sentences(text: str, max_len: int = 350) -> list[str]:
    """Break a long text into chunks at sentence boundaries.

    Text that (after stripping) fits into ``max_len`` characters is returned
    as a single-element list; empty text yields an empty list. Otherwise the
    text is split with ``_SENTENCE_SPLIT`` and consecutive fragments are
    joined with a space for as long as the joined chunk stays within
    ``max_len``. A single fragment longer than ``max_len`` is kept whole.
    """
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    fragments = (piece.strip() for piece in _SENTENCE_SPLIT.split(text))
    chunks: list[str] = []
    current = ""
    for fragment in fragments:
        if not fragment:
            continue
        if len(current) + len(fragment) + 1 > max_len:
            # Adding this fragment would overflow: flush and start a new chunk.
            if current:
                chunks.append(current)
            current = fragment
            continue
        current = f"{current} {fragment}".strip()
    if current:
        chunks.append(current)
    return chunks if chunks else [text]
|
||||||
|
|
||||||
|
|
||||||
|
def float_to_pcm16(wav: np.ndarray) -> bytes:
    """Convert float samples in the range -1..+1 to 16-bit little-endian PCM bytes.

    Values outside the range are clipped. NaN becomes silence (0) and
    +/-inf becomes full scale, so non-finite samples cannot produce
    undefined integer values. The output dtype is explicitly little-endian
    ("<i2"), independent of the host byte order.
    """
    samples = np.nan_to_num(
        np.asarray(wav, dtype=np.float32), nan=0.0, posinf=1.0, neginf=-1.0
    )
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767.0).astype("<i2").tobytes()
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_voice_name(name: str) -> str:
    """Replace every character that is not an ASCII letter, digit, '_' or '-' with '_'."""
    unsafe = re.compile(r"[^a-zA-Z0-9_-]")
    return unsafe.sub("_", name)
|
||||||
|
|
||||||
|
|
||||||
|
def voice_paths(name: str) -> tuple[Path, Path]:
    """Return the ``(wav, txt)`` file paths inside VOICES_DIR for a voice name.

    The name is passed through ``sanitize_voice_name`` first.
    """
    stem = sanitize_voice_name(name)
    wav_file = VOICES_DIR / f"{stem}.wav"
    txt_file = VOICES_DIR / f"{stem}.txt"
    return wav_file, txt_file
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_24k_mono_wav(src_wav: Path) -> Path:
    """Convert ``src_wav`` in place to TARGET_SR mono WAV using ffmpeg.

    If soundfile reports that the file already has the target sample rate
    and one channel, nothing is changed. Otherwise ffmpeg writes a converted
    copy to a temporary file which then replaces the original.

    The conversion is best effort: if ffmpeg is missing, times out or exits
    with an error, a warning is logged, the temporary file is removed and
    the original file is left untouched.

    Returns:
        ``src_wav`` (the same path) in every case.
    """
    try:
        info = sf.info(str(src_wav))
        if info.samplerate == TARGET_SR and info.channels == 1:
            return src_wav
    except Exception:
        # soundfile could not read the file — let ffmpeg have a go at it.
        pass

    # The temp name deliberately does not end in ".wav" so that a "*.wav"
    # glob over the directory never picks it up; "-f wav" fixes the format.
    tmp_out = src_wav.with_name(src_wav.name + ".tmp")
    cmd = ["ffmpeg", "-y", "-i", str(src_wav),
           "-ar", str(TARGET_SR), "-ac", "1", "-f", "wav", str(tmp_out)]

    failure = None
    try:
        r = subprocess.run(cmd, capture_output=True, timeout=30)
        if r.returncode != 0:
            failure = r.stderr.decode(errors="replace")[:200]
    except (subprocess.TimeoutExpired, OSError) as e:
        # Timeout, or ffmpeg could not be started at all.
        failure = str(e)[:200]

    if failure is not None:
        logger.warning("ffmpeg-Konvertierung von %s fehlgeschlagen: %s",
                       src_wav, failure)
        try:
            tmp_out.unlink(missing_ok=True)
        except OSError:
            pass
        return src_wav

    os.replace(tmp_out, src_wav)
    return src_wav
|
||||||
|
|
||||||
|
|
||||||
|
async def _send(ws, mtype: str, payload: dict) -> None:
|
||||||
|
try:
|
||||||
|
await ws.send(json.dumps({
|
||||||
|
"type": mtype,
|
||||||
|
"payload": payload,
|
||||||
|
"timestamp": int(time.time() * 1000),
|
||||||
|
}))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Send fehlgeschlagen (%s): %s", mtype, e)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Interne Transkription via whisper-bridge ────────────────
|
||||||
|
|
||||||
|
# Futures of in-flight transcription requests, keyed by requestId.
_pending_stt: dict[str, asyncio.Future] = {}
# How long request_transcription() waits for an answer, in seconds.
_STT_TIMEOUT_S = 60.0


async def request_transcription(ws, wav_path: Path, language: str = "de") -> Optional[str]:
    """Send the file as an ``stt_request`` over ``ws`` and wait for the result.

    A future is registered in ``_pending_stt`` under a fresh request id and
    awaited for at most ``_STT_TIMEOUT_S`` seconds; whoever resolves that
    future determines the return value. The registration is removed again
    in every case.

    Returns:
        The future's result, or None if the file cannot be read, the wait
        times out, or any other error occurs.
    """
    try:
        with open(wav_path, "rb") as handle:
            raw_audio = handle.read()
        audio_b64 = base64.b64encode(raw_audio).decode("ascii")
    except Exception as e:
        logger.error("Lesen %s fehlgeschlagen: %s", wav_path, e)
        return None

    request_id = str(uuid.uuid4())
    pending: asyncio.Future = asyncio.get_event_loop().create_future()
    _pending_stt[request_id] = pending

    message = {
        "requestId": request_id,
        "audio": audio_b64,
        "mimeType": "audio/wav",
        "model": "small",  # the small model is enough for a voice reference
        "language": language,
    }
    try:
        await _send(ws, "stt_request", message)
        result = await asyncio.wait_for(pending, timeout=_STT_TIMEOUT_S)
        return result
    except asyncio.TimeoutError:
        logger.warning("Transkription Timeout fuer %s", wav_path.name)
        return None
    except Exception as e:
        logger.warning("Transkription Fehler: %s", e)
        return None
    finally:
        _pending_stt.pop(request_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
# ── TTS-Request Handler ─────────────────────────────────────
|
||||||
|
|
||||||
|
# Queue so that parallel requests do not overlap (GPU throughput)
_tts_queue: asyncio.Queue[tuple] = asyncio.Queue()


async def _tts_worker(ws, runner: F5Runner) -> None:
    """Process queued TTS jobs one after another, forever.

    Each queue entry is a 5-tuple ``(text, voice, request_id, message_id,
    language)`` that is handed to ``_do_tts``. An exception from a job is
    logged and does not stop the worker; ``task_done()`` is called for every
    job that was started.
    """
    while True:
        job = await _tts_queue.get()
        text, voice, request_id, message_id, language = job
        try:
            await _do_tts(ws, runner, text, voice, request_id, message_id, language)
        except Exception:
            logger.exception("TTS-Worker Fehler")
        finally:
            _tts_queue.task_done()
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_reference(ws, voice: str, language: str) -> Optional[tuple[str, str]]:
    """Choose the reference WAV path and reference text for one render.

    Order of preference:
      1. The requested voice, if both its .wav and .txt exist. If only the
         .wav exists, a transcription is requested and stored as .txt first.
      2. ``default_ref.wav`` / ``default_ref.txt`` in VOICES_DIR.
      3. The first complete .wav/.txt pair in VOICES_DIR (sorted by name, so
         the choice is stable between calls).

    Returns:
        ``(wav_path, ref_text)`` or None if no usable reference exists.
    """
    if voice:
        ref_wav_path, ref_txt_path = voice_paths(voice)
        wav_ok = ref_wav_path.exists()
        txt_ok = ref_txt_path.exists()
        if wav_ok and not txt_ok:
            # Only the WAV is there → transcribe on the fly.
            logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
            text_ref = (await request_transcription(ws, ref_wav_path, language) or "").strip()
            # A whitespace-only transcription is treated like a failed one,
            # so no empty reference text is persisted.
            if text_ref:
                try:
                    ref_txt_path.write_text(text_ref, encoding="utf-8")
                    txt_ok = True
                    logger.info("Referenz-Text nachgezogen: '%s'", text_ref[:60])
                except Exception as e:
                    logger.warning("Referenz-Text speichern fehlgeschlagen: %s", e)
        if wav_ok and txt_ok:
            return str(ref_wav_path), ref_txt_path.read_text(encoding="utf-8").strip()
        logger.warning("Voice '%s' nicht komplett (%s, txt=%s) — nehme Default",
                       voice, ref_wav_path, ref_txt_path.exists())

    # Fallback: F5-TTS always needs a reference.
    default_wav = VOICES_DIR / "default_ref.wav"
    default_txt = VOICES_DIR / "default_ref.txt"
    if default_wav.exists() and default_txt.exists():
        return str(default_wav), default_txt.read_text(encoding="utf-8").strip()

    # Take any existing complete voice pair.
    for wav in sorted(VOICES_DIR.glob("*.wav")):
        txt = wav.with_suffix(".txt")
        if txt.exists():
            return str(wav), txt.read_text(encoding="utf-8").strip()
    return None


async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
                  request_id: str, message_id: str, language: str) -> None:
    """Render ``text`` sentence by sentence and stream it as ``audio_pcm`` messages.

    Each synthesized sentence is converted to 16-bit PCM and sent in slices
    of PCM_CHUNK_BYTES with a running ``chunk`` index and ``final: False``.
    After the last sentence an empty message with ``final: True`` is sent.

    If no reference voice can be found, or synthesis of a sentence fails, an
    ``xtts_response`` message carrying ``error`` is sent and the render
    stops without a final marker.

    Args:
        ws: Websocket the messages are sent on.
        runner: Model runner used for synthesis.
        text: Text to speak.
        voice: Requested voice name; empty means "use the default reference".
        request_id: Echoed as ``requestId`` in every message.
        message_id: Echoed as ``messageId`` in every audio message.
        language: Language passed along when a reference has to be transcribed.
    """
    t0 = time.time()

    reference = await _resolve_reference(ws, voice, language)
    if reference is None:
        logger.error("Keine Referenz-Stimme im VOICES_DIR — TTS abgebrochen")
        # Tell the requester instead of leaving it without any response,
        # same message type as the synthesis-error path below.
        await _send(ws, "xtts_response", {
            "requestId": request_id,
            "error": "no-reference-voice",
        })
        return
    ref_wav_str, ref_text = reference

    sentences = split_sentences(text)
    logger.info("F5-TTS: %d Satz(e), voice=%s (%s)", len(sentences), voice or "default", ref_wav_str)

    chunk_index = 0
    pcm_sr = TARGET_SR
    for i, sent in enumerate(sentences):
        try:
            wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
            pcm_sr = sr
            pcm_bytes = float_to_pcm16(wav)
            # Only the very beginning of the first sentence gets a fade-in
            # (masks possible warm-up glitches); everything else is sent as is.
            if i == 0 and chunk_index == 0:
                pcm_bytes = _fade_in_pcm16(pcm_bytes, sr, 120)

            # Slice into transport-sized pieces.
            for off in range(0, len(pcm_bytes), PCM_CHUNK_BYTES):
                slice_ = pcm_bytes[off:off + PCM_CHUNK_BYTES]
                await _send(ws, "audio_pcm", {
                    "requestId": request_id,
                    "messageId": message_id,
                    "base64": base64.b64encode(slice_).decode("ascii"),
                    "format": "pcm_s16le",
                    "sampleRate": sr,
                    "channels": 1,
                    "voice": voice or "default",
                    "chunk": chunk_index,
                    "final": False,
                })
                chunk_index += 1
        except Exception as e:
            logger.exception("F5-TTS Synthese-Fehler (Satz %d)", i)
            await _send(ws, "xtts_response", {
                "requestId": request_id,
                "error": str(e)[:200],
            })
            return

    # Final marker
    await _send(ws, "audio_pcm", {
        "requestId": request_id,
        "messageId": message_id,
        "base64": "",
        "format": "pcm_s16le",
        "sampleRate": pcm_sr,
        "channels": 1,
        "voice": voice or "default",
        "chunk": chunk_index,
        "final": True,
    })

    logger.info("TTS komplett: %d Chunks, %.2fs render (voice=%s, text=%d chars)",
                chunk_index, time.time() - t0, voice or "default", len(text))
|
||||||
|
|
||||||
|
|
||||||
|
def _fade_in_pcm16(pcm: bytes, sr: int, fade_ms: int) -> bytes:
|
||||||
|
"""Linear Fade-In auf erste fade_ms — maskiert Warmup-Glitches."""
|
||||||
|
arr = np.frombuffer(pcm, dtype=np.int16).copy()
|
||||||
|
fade_samples = min(int((fade_ms / 1000.0) * sr), len(arr))
|
||||||
|
if fade_samples <= 0:
|
||||||
|
return pcm
|
||||||
|
ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
|
||||||
|
arr[:fade_samples] = (arr[:fade_samples].astype(np.float32) * ramp).astype(np.int16)
|
||||||
|
return arr.tobytes()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Voice Management Handlers ───────────────────────────────
|
||||||
|
|
||||||
|
async def handle_voice_upload(ws, payload: dict) -> None:
    """Store an uploaded voice sample together with its reference text.

    Steps:
      1. Decode the base64 ``samples`` and write them to ``{name}.wav``.
      2. Normalize the file to TARGET_SR mono. This runs in the default
         executor because the ffmpeg call is blocking.
      3. Request a transcription and store it as ``{name}.txt``; a
         placeholder text is stored if the transcription fails.
      4. Send ``xtts_voice_saved`` and an updated voice list.

    Any error is logged and reported as ``xtts_voice_saved`` with ``error``.
    """
    name = (payload.get("name") or "").strip()
    samples = payload.get("samples") or []
    if not name or not samples:
        logger.warning("voice_upload: ungueltig (name=%r, samples=%d)", name, len(samples))
        return
    logger.info("Voice-Upload: '%s' (%d Samples)", name, len(samples))

    try:
        VOICES_DIR.mkdir(parents=True, exist_ok=True)
        safe = sanitize_voice_name(name)
        wav_path = VOICES_DIR / f"{safe}.wav"
        txt_path = VOICES_DIR / f"{safe}.txt"

        # Join the samples.
        # NOTE(review): the decoded buffers are concatenated byte-wise. If
        # more than one sample is sent and each one is a complete WAV file,
        # the result contains embedded headers — confirm what the app sends.
        buffers = [base64.b64decode(s.get("base64", "")) for s in samples]
        with open(wav_path, "wb") as f:
            for b in buffers:
                f.write(b)
        logger.info("Voice WAV gespeichert: %s (%.0fKB)",
                    wav_path, wav_path.stat().st_size / 1024)

        # Normalize to 24kHz mono (in case the app delivers another format).
        # Run off the event loop: ffmpeg may take up to its 30s timeout.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, ensure_24k_mono_wav, wav_path)
        # Report the size of the file as stored, i.e. after normalization,
        # which is also what the voice list reports.
        size_bytes = wav_path.stat().st_size

        # Request the transcription via the whisper-bridge.
        logger.info("Transkribiere '%s' via whisper-bridge...", name)
        text = await request_transcription(ws, wav_path, language="de")
        if not text:
            logger.warning("Transkription fehlgeschlagen — speichere Platzhalter-Text")
            text = "Das ist ein Referenz Audio."
        txt_path.write_text(text.strip(), encoding="utf-8")
        logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])

        await _send(ws, "xtts_voice_saved", {
            "name": name, "size": size_bytes, "refText": text.strip(),
        })
        # Refresh the list
        await handle_list_voices(ws)
    except Exception as e:
        logger.exception("voice_upload Fehler")
        await _send(ws, "xtts_voice_saved", {"name": name, "error": str(e)[:200]})
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_list_voices(ws) -> None:
    """Send an ``xtts_voices_list`` message describing every ``*.wav`` in VOICES_DIR.

    Each entry carries ``name`` (file stem), ``file``, ``size`` in bytes and
    ``hasRefText`` (whether the matching .txt exists). Entries are sorted by
    path. Errors are logged and not raised.
    """
    try:
        wav_files = sorted(VOICES_DIR.glob("*.wav")) if VOICES_DIR.exists() else []
        voices = [
            {
                "name": wav.stem,
                "file": wav.name,
                "size": wav.stat().st_size,
                "hasRefText": wav.with_suffix(".txt").exists(),
            }
            for wav in wav_files
        ]
        logger.info("Stimmen-Liste: %d", len(voices))
        await _send(ws, "xtts_voices_list", {"voices": voices})
    except Exception:
        logger.exception("handle_list_voices Fehler")
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_delete_voice(ws, payload: dict) -> None:
    """Delete the .wav and .txt of the voice named in ``payload`` and resend the list.

    Does nothing when no name is given. Files that do not exist are
    skipped. Errors are logged and not raised.
    """
    name = (payload.get("name") or "").strip()
    if not name:
        return
    try:
        for target in voice_paths(name):
            if not target.exists():
                continue
            target.unlink()
            logger.info("Voice geloescht: %s", target)
        await handle_list_voices(ws)
    except Exception:
        logger.exception("handle_delete_voice Fehler")
|
||||||
|
|
||||||
|
|
||||||
|
# Last voice that was set via a diagnostic config message (prevents an
# endless preload on every config message)
_last_diag_voice = ""


async def handle_voice_preload(ws, payload: dict, runner: F5Runner) -> None:
    """Warm up a voice and answer with a ``voice_ready`` message.

    For a named voice the .wav must exist, otherwise ``voice_ready`` is sent
    with ``error: voice-file-not-found``. A missing .txt is filled in via a
    transcription request. Then a short dummy text is rendered through
    ``_do_tts`` and the elapsed time is reported as ``loadMs``. Exceptions
    are logged and reported as ``voice_ready`` with ``error``.
    """
    voice = (payload.get("voice") or "").strip()
    request_id = payload.get("requestId", "")
    logger.info("Voice-Preload angefordert: '%s'", voice or "default")

    def reply(**extra) -> dict:
        # Common fields of every voice_ready answer, plus the given extras.
        return {"voice": voice, "requestId": request_id, **extra}

    try:
        if voice:
            ref_wav, ref_txt = voice_paths(voice)
            if not ref_wav.exists():
                await _send(ws, "voice_ready", reply(error="voice-file-not-found"))
                return

            # Make sure the reference text exists (in case only the WAV is there)
            if not ref_txt.exists():
                text = await request_transcription(ws, ref_wav, language="de")
                if text:
                    ref_txt.write_text(text.strip(), encoding="utf-8")
                    logger.info("Referenz-Text beim Preload nachgezogen")

        # Dummy render as warm-up
        started = time.time()
        await _do_tts(ws, runner, "ja.", voice, f"preload-{request_id}", "", "de")
        ms = int((time.time() - started) * 1000)
        await _send(ws, "voice_ready", reply(loadMs=ms))
    except Exception as e:
        logger.exception("Voice-Preload Fehler")
        await _send(ws, "voice_ready", reply(error=str(e)[:200]))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Haupt-Loop ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def run_loop(runner: F5Runner) -> None:
    """Connect to the RVS and dispatch incoming messages, reconnecting forever.

    Per connection a TTS worker task is started and cancelled again when the
    connection ends. Incoming message types:

      * ``xtts_request``       → queued for the TTS worker
      * ``voice_upload``, ``xtts_list_voices``, ``xtts_delete_voice``,
        ``voice_preload``      → handled in background tasks
      * ``stt_response``       → resolves the matching future in ``_pending_stt``
      * ``config``             → preloads ``xttsVoice`` when it changed

    Reconnect policy: exponential backoff from 2s up to 30s. If a TLS
    connection cannot be *established* and RVS_TLS_FALLBACK is set, one
    immediate plaintext attempt follows. A drop of an already established
    connection is not treated as a TLS failure, and every new cycle starts
    with the configured RVS_TLS preference again, so a single failure does
    not downgrade the link permanently.
    """
    global _last_diag_voice

    # The event loop only keeps weak references to tasks; hold strong ones
    # until each task is done so it cannot be garbage-collected mid-flight.
    background: set = set()

    def spawn(coro) -> asyncio.Task:
        task = asyncio.create_task(coro)
        background.add(task)
        task.add_done_callback(background.discard)
        return task

    def log_load_failure(task: asyncio.Task) -> None:
        # Retrieve the exception so a failed model load is reported clearly.
        if not task.cancelled() and task.exception() is not None:
            logger.error("F5-TTS Laden fehlgeschlagen: %s", task.exception())

    # Start loading the model in the background so startup is not blocked.
    spawn(runner.ensure_loaded()).add_done_callback(log_load_failure)

    use_tls = RVS_TLS
    retry_s = 2

    while True:
        scheme = "wss" if use_tls else "ws"
        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url

        connected = False
        try:
            logger.info("Verbinde zu RVS: %s", masked)
            async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
                connected = True
                logger.info("RVS verbunden")
                retry_s = 2

                # Start the TTS worker for this connection
                worker = asyncio.create_task(_tts_worker(ws, runner))

                try:
                    async for raw in ws:
                        try:
                            msg = json.loads(raw)
                        except Exception:
                            continue
                        # Valid JSON that is not an object (list, number, ...)
                        # must not take the connection down.
                        if not isinstance(msg, dict):
                            continue
                        mtype = msg.get("type", "")
                        payload = msg.get("payload", {}) or {}
                        if not isinstance(payload, dict):
                            payload = {}

                        if mtype == "xtts_request":
                            await _tts_queue.put((
                                payload.get("text", ""),
                                payload.get("voice", "") or "",
                                payload.get("requestId", ""),
                                payload.get("messageId", ""),
                                payload.get("language", "de"),
                            ))
                        elif mtype == "voice_upload":
                            spawn(handle_voice_upload(ws, payload))
                        elif mtype == "xtts_list_voices":
                            spawn(handle_list_voices(ws))
                        elif mtype == "xtts_delete_voice":
                            spawn(handle_delete_voice(ws, payload))
                        elif mtype == "voice_preload":
                            spawn(handle_voice_preload(ws, payload, runner))
                        elif mtype == "stt_response":
                            # Answer to our internal transcription request
                            req_id = payload.get("requestId", "")
                            fut = _pending_stt.get(req_id)
                            if fut and not fut.done():
                                if payload.get("error"):
                                    fut.set_result(None)
                                else:
                                    fut.set_result(payload.get("text") or "")
                        elif mtype == "config":
                            v = (payload.get("xttsVoice") or "").strip()
                            if v and v != _last_diag_voice:
                                _last_diag_voice = v
                                spawn(handle_voice_preload(
                                    ws, {"voice": v, "source": "diagnostic"}, runner,
                                ))
                            elif not v:
                                _last_diag_voice = ""
                finally:
                    worker.cancel()
                    try:
                        await worker
                    except asyncio.CancelledError:
                        pass
        except Exception as e:
            logger.warning("Verbindung verloren: %s", e)
            # Only a failure to establish the TLS connection triggers the
            # plaintext fallback — not the loss of a working session.
            if not connected and use_tls and RVS_TLS_FALLBACK:
                logger.info("TLS fehlgeschlagen — Fallback auf ws://")
                use_tls = False
                continue

        # Next cycle starts with the configured preference again.
        use_tls = RVS_TLS
        await asyncio.sleep(min(retry_s, 30))
        retry_s = min(retry_s * 2, 30)
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Entry point: check the configuration, create VOICES_DIR and run the main loop.

    Exits the process with status 1 when RVS_HOST is not set.
    """
    if RVS_HOST:
        VOICES_DIR.mkdir(parents=True, exist_ok=True)
        await run_loop(F5Runner())
        return
    logger.error("RVS_HOST nicht gesetzt — Abbruch")
    sys.exit(1)


if __name__ == "__main__":
    # Ctrl+C ends the process with exit status 0.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        sys.exit(0)
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
f5-tts>=1.0.0
|
||||||
|
websockets>=12.0
|
||||||
|
numpy>=1.24
|
||||||
|
soundfile>=0.12
|
||||||
|
requests>=2.31
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"name": "aria-xtts-bridge",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"private": true,
|
|
||||||
"dependencies": {
|
|
||||||
"ws": "^8.16.0"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue