diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 8219ba8..3ae635e 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -496,6 +496,7 @@ class ARIABridge: # Komponenten (TTS: immer XTTS remote, Piper wurde entfernt) self.tts_enabled = True self.xtts_voice = "" + self._f5tts_config: dict = {} vc: dict = {} # Gespeicherte Voice-Config laden try: @@ -505,7 +506,16 @@ class ARIABridge: vc = json.load(f) self.tts_enabled = vc.get("ttsEnabled", True) self.xtts_voice = vc.get("xttsVoice", "") - logger.info("Voice-Config geladen: tts=%s voice=%s", self.tts_enabled, self.xtts_voice or "default") + # F5-TTS-Felder aufsammeln (werden spaeter via RVS rebroadcastet, + # damit die f5tts-bridge auf der Gamebox die Settings auch nach + # Restart wiederbekommt — sonst stuende sie auf Hard-Defaults) + for k in ("f5ttsModel", "f5ttsCkptFile", "f5ttsVocabFile", + "f5ttsCfgStrength", "f5ttsNfeStep"): + if k in vc: + self._f5tts_config[k] = vc[k] + logger.info("Voice-Config geladen: tts=%s voice=%s f5tts=%s", + self.tts_enabled, self.xtts_voice or "default", + self._f5tts_config or "defaults") except Exception as e: logger.warning("Voice-Config laden fehlgeschlagen: %s", e) # Whisper-Modell: Config hat Vorrang, dann env/Default (medium) @@ -963,6 +973,29 @@ class ARIABridge: except Exception as e: logger.debug("[mode] Broadcast fehlgeschlagen: %s", e) + async def _broadcast_persisted_config(self) -> None: + """Broadcastet die aktuelle voice_config.json einmalig nach RVS-Connect. + + Damit bekommen frisch verbundene Bridges (insbesondere die f5tts-bridge + auf der Gamebox nach Container-Restart) die zuletzt in Diagnostic + gewaehlten Settings — ohne dass der User in Diagnostic was klicken muss. 
+        """
+        import time  # local import: wall-clock source for the message timestamp
+        try:
+            payload = {
+                "ttsEnabled": getattr(self, "tts_enabled", True),
+                "xttsVoice": getattr(self, "xtts_voice", ""),
+                "whisperModel": self.stt_engine.model_size,
+            }
+            payload.update(getattr(self, "_f5tts_config", {}) or {})
+            await self._send_to_rvs({
+                "type": "config", "payload": payload,
+                # Epoch milliseconds; loop.time() is a monotonic clock, not wall time.
+                "timestamp": int(time.time() * 1000),
+            })
+            logger.info("[rvs] Persistierte Config broadcastet: %s", payload)
+        except Exception as e:
+            logger.debug("[rvs] Config-Broadcast fehlgeschlagen: %s", e)
+
     def _fetch_active_session(self) -> None:
         """Holt die aktive Session vom Diagnostic-Endpoint."""
         try:
@@ -1032,6 +1065,12 @@ class ARIABridge:
                 # ihren UI-State sofort syncen koennen
                 await self._broadcast_current_mode()
 
+                # Persistierte Voice-Config broadcasten — die f5tts-bridge auf
+                # der Gamebox bekommt damit nach Restart die zuletzt in
+                # Diagnostic gewaehlten Settings wieder (sonst stuende sie auf
+                # ihren Hard-Defaults).
+                asyncio.create_task(self._broadcast_persisted_config())
+
                 # Heartbeat senden (RVS erwartet Ping alle 30s)
                 heartbeat_task = asyncio.create_task(self._rvs_heartbeat())
 
@@ -1195,7 +1234,10 @@ class ARIABridge:
                 return
 
             elif msg_type == "config":
-                # Konfiguration von App/Diagnostic empfangen + persistent speichern
+                # Konfiguration von App/Diagnostic empfangen + persistent speichern.
+                # Felder die nicht direkt zur aria-bridge gehoeren (f5tts*) werden
+                # nur persistiert; die f5tts-bridge auf der Gamebox empfaengt den
+                # gleichen RVS-Broadcast und reagiert selber.
                 changed = False
                 if "ttsEnabled" in payload:
                     self.tts_enabled = bool(payload["ttsEnabled"])
@@ -1209,14 +1251,19 @@ class ARIABridge:
                     new_model = payload["whisperModel"]
                     allowed = {"tiny", "base", "small", "medium", "large-v3"}
                     if new_model in allowed and new_model != self.stt_engine.model_size:
-                        # Merken und mitschicken an whisper-bridge (Gamebox).
- # Lokales Modell wird NICHT geladen — nur das Fallback braucht's, - # und das passiert erst on-demand wenn Remote nicht antwortet. logger.info("[rvs] Whisper-Modell → %s (nur Config; Modell laedt Gamebox)", new_model) self.stt_engine.model_size = new_model self.stt_engine.model = None changed = True + # F5-TTS-Felder: einfach persistieren, f5tts-bridge applied selber. + for k in ("f5ttsModel", "f5ttsCkptFile", "f5ttsVocabFile", + "f5ttsCfgStrength", "f5ttsNfeStep"): + if k in payload: + if not hasattr(self, "_f5tts_config"): + self._f5tts_config = {} + self._f5tts_config[k] = payload[k] + changed = True # Persistent speichern in Shared Volume if changed: try: @@ -1226,6 +1273,7 @@ class ARIABridge: "xttsVoice": getattr(self, "xtts_voice", ""), "whisperModel": self.stt_engine.model_size, } + config_data.update(getattr(self, "_f5tts_config", {})) with open("/shared/config/voice_config.json", "w") as f: json.dump(config_data, f, indent=2) logger.info("[rvs] Voice-Config gespeichert: %s", config_data) diff --git a/diagnostic/index.html b/diagnostic/index.html index 9d32de2..33577f7 100644 --- a/diagnostic/index.html +++ b/diagnostic/index.html @@ -450,6 +450,58 @@
+ +
+ F5-TTS Modell-Tuning (advanced) +
+
+ Werden via RVS an die f5tts-bridge auf der Gamebox geschickt. + Modell-/Checkpoint-Wechsel triggert einen Reload (~30s). + Hardcoded Defaults: F5TTS_v1_Base, cfg_strength=2.5, nfe_step=32. +
+ + + + + + + + + + +
+
+ + +
Hoeher = klebt staerker an Referenz
+
+
+ + +
Hoeher = bessere Qualitaet, langsamer
+
+
+ + +
+
+
Stimme klonen
@@ -841,6 +893,16 @@ const wSel = document.getElementById('diag-whisper-model'); if (wSel) wSel.value = msg.whisperModel; } + // F5-TTS Tuning-Felder wiederherstellen (falls gesetzt) + const setIfPresent = (id, val) => { + const el = document.getElementById(id); + if (el && val !== undefined && val !== null && val !== '') el.value = val; + }; + setIfPresent('diag-f5tts-model', msg.f5ttsModel); + setIfPresent('diag-f5tts-ckpt', msg.f5ttsCkptFile); + setIfPresent('diag-f5tts-vocab', msg.f5ttsVocabFile); + setIfPresent('diag-f5tts-cfg', msg.f5ttsCfgStrength); + setIfPresent('diag-f5tts-nfe', msg.f5ttsNfeStep); return; } @@ -1570,7 +1632,19 @@ const ttsEnabled = document.getElementById('diag-tts-enabled').checked; const xttsVoice = document.getElementById('diag-xtts-voice').value; const whisperModel = document.getElementById('diag-whisper-model').value; - send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel }); + const f5ttsModel = document.getElementById('diag-f5tts-model')?.value || ''; + const f5ttsCkptFile = document.getElementById('diag-f5tts-ckpt')?.value || ''; + const f5ttsVocabFile = document.getElementById('diag-f5tts-vocab')?.value || ''; + const f5ttsCfgRaw = document.getElementById('diag-f5tts-cfg')?.value || ''; + const f5ttsNfeRaw = document.getElementById('diag-f5tts-nfe')?.value || ''; + const f5ttsCfgStrength = f5ttsCfgRaw ? parseFloat(f5ttsCfgRaw) : undefined; + const f5ttsNfeStep = f5ttsNfeRaw ? 
parseInt(f5ttsNfeRaw, 10) : undefined; + send({ + action: 'send_voice_config', + ttsEnabled, xttsVoice, whisperModel, + f5ttsModel, f5ttsCkptFile, f5ttsVocabFile, + f5ttsCfgStrength, f5ttsNfeStep, + }); const statusEl = document.getElementById('voice-status'); if (statusEl && xttsVoice) { statusEl.textContent = `⏳ Stimme "${xttsVoice}" wird geladen...`; diff --git a/diagnostic/server.js b/diagnostic/server.js index 3ed43e6..a780680 100644 --- a/diagnostic/server.js +++ b/diagnostic/server.js @@ -1423,6 +1423,25 @@ wss.on("connection", (ws) => { xttsVoice: msg.xttsVoice || "", }; if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel; + // F5-TTS Tuning-Felder — leere Strings entfernen damit der Default greift + if (msg.f5ttsModel !== undefined) { + if (msg.f5ttsModel) voiceConfig.f5ttsModel = msg.f5ttsModel; + else delete voiceConfig.f5ttsModel; + } + if (msg.f5ttsCkptFile !== undefined) { + if (msg.f5ttsCkptFile) voiceConfig.f5ttsCkptFile = msg.f5ttsCkptFile; + else delete voiceConfig.f5ttsCkptFile; + } + if (msg.f5ttsVocabFile !== undefined) { + if (msg.f5ttsVocabFile) voiceConfig.f5ttsVocabFile = msg.f5ttsVocabFile; + else delete voiceConfig.f5ttsVocabFile; + } + if (msg.f5ttsCfgStrength !== undefined && !isNaN(msg.f5ttsCfgStrength)) { + voiceConfig.f5ttsCfgStrength = msg.f5ttsCfgStrength; + } + if (msg.f5ttsNfeStep !== undefined && !isNaN(msg.f5ttsNfeStep)) { + voiceConfig.f5ttsNfeStep = msg.f5ttsNfeStep; + } try { fs.mkdirSync("/shared/config", { recursive: true }); fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2)); diff --git a/xtts/docker-compose.yml b/xtts/docker-compose.yml index 24f4c12..f4b8482 100644 --- a/xtts/docker-compose.yml +++ b/xtts/docker-compose.yml @@ -33,17 +33,14 @@ services: - ./voices:/voices # WAV + TXT Referenz - f5tts-models:/root/.cache/huggingface # Model-Cache persistieren environment: + # Bootstrap-only — alle anderen F5-TTS-Settings (Modell, cfg_strength, + # 
nfe_step, Custom-Checkpoint) kommen ueber Diagnostic via RVS-config. - RVS_HOST=${RVS_HOST} - RVS_PORT=${RVS_PORT:-443} - RVS_TLS=${RVS_TLS:-true} - RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true} - RVS_TOKEN=${RVS_TOKEN} - - F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base} - - F5TTS_CKPT_FILE=${F5TTS_CKPT_FILE:-} - - F5TTS_VOCAB_FILE=${F5TTS_VOCAB_FILE:-} - F5TTS_DEVICE=${F5TTS_DEVICE:-cuda} - - F5TTS_CFG_STRENGTH=${F5TTS_CFG_STRENGTH:-2.5} - - F5TTS_NFE_STEP=${F5TTS_NFE_STEP:-32} - VOICES_DIR=/voices restart: unless-stopped diff --git a/xtts/f5tts/bridge.py b/xtts/f5tts/bridge.py index 3330cb3..7ae4cee 100644 --- a/xtts/f5tts/bridge.py +++ b/xtts/f5tts/bridge.py @@ -52,15 +52,23 @@ RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true" RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true" RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip() -F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base") -F5TTS_CKPT_FILE = os.getenv("F5TTS_CKPT_FILE", "") # optional: HF-Repo oder lokales .pt -F5TTS_VOCAB_FILE = os.getenv("F5TTS_VOCAB_FILE", "") # optional: zugehoerige vocab.txt -F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda") +# F5-TTS Konfiguration +# ───────────────────────────────────────────────────────────────── +# Defaults sind hard-coded — bewusst KEINE ENV-Vars (ausser F5TTS_DEVICE, +# weil Hardware-Bootstrap). Alle Settings werden zur Laufzeit via RVS +# config-Broadcast aus Diagnostic uebersteuert (Felder f5ttsModel, +# f5ttsCkptFile, f5ttsVocabFile, f5ttsCfgStrength, f5ttsNfeStep). +F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda") # nur Bootstrap + +DEFAULT_F5TTS_MODEL = "F5TTS_v1_Base" +DEFAULT_F5TTS_CKPT_FILE = "" # leer = Default-Checkpoint von HF +DEFAULT_F5TTS_VOCAB_FILE = "" # leer = Default-Vocab vom Modell # cfg_strength: wie stark der Generator am Referenz-Voice klebt. -# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) hilft >2.5 +# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) 
hilft 2.5+, # damit das Modell nicht in eine andere Sprache abrutscht. -F5TTS_CFG_STRENGTH = float(os.getenv("F5TTS_CFG_STRENGTH", "2.5")) -F5TTS_NFE_STEP = int(os.getenv("F5TTS_NFE_STEP", "32")) +DEFAULT_F5TTS_CFG_STRENGTH = 2.5 +DEFAULT_F5TTS_NFE_STEP = 32 + VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices")) PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16 @@ -86,25 +94,36 @@ def _get_f5tts_cls(): class F5Runner: - """Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking).""" + """Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking). + + Live-Settings (Modell, cfg_strength, nfe_step) werden ueber update_config() + aus dem Diagnostic-Config-Broadcast gesetzt; bei Modell-Wechsel wird + automatisch neu geladen. + """ def __init__(self) -> None: self.model = None self._lock = asyncio.Lock() + # Aktuelle Werte — gestartet mit Hard-Defaults, ueberschrieben von Diagnostic + self.model_id: str = DEFAULT_F5TTS_MODEL + self.ckpt_file: str = DEFAULT_F5TTS_CKPT_FILE + self.vocab_file: str = DEFAULT_F5TTS_VOCAB_FILE + self.cfg_strength: float = DEFAULT_F5TTS_CFG_STRENGTH + self.nfe_step: int = DEFAULT_F5TTS_NFE_STEP def _load_blocking(self) -> None: cls = _get_f5tts_cls() logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...", - F5TTS_MODEL, F5TTS_DEVICE, F5TTS_CKPT_FILE or "default") + self.model_id, F5TTS_DEVICE, self.ckpt_file or "default") t0 = time.time() - kwargs = {"model": F5TTS_MODEL, "device": F5TTS_DEVICE} - if F5TTS_CKPT_FILE: - kwargs["ckpt_file"] = F5TTS_CKPT_FILE - if F5TTS_VOCAB_FILE: - kwargs["vocab_file"] = F5TTS_VOCAB_FILE + kwargs = {"model": self.model_id, "device": F5TTS_DEVICE} + if self.ckpt_file: + kwargs["ckpt_file"] = self.ckpt_file + if self.vocab_file: + kwargs["vocab_file"] = self.vocab_file self.model = cls(**kwargs) logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)", - time.time() - t0, F5TTS_CFG_STRENGTH, F5TTS_NFE_STEP) + time.time() - t0, self.cfg_strength, self.nfe_step) async def ensure_loaded(self) -> 
None:
         async with self._lock:
@@ -113,19 +132,73 @@ class F5Runner:
             loop = asyncio.get_event_loop()
             await loop.run_in_executor(None, self._load_blocking)
 
+    async def update_config(self, payload: dict) -> None:
+        """Apply the f5tts* fields of a config broadcast to this runner.
+
+        cfg_strength and nfe_step take effect on the next synthesis; a
+        changed model, checkpoint or vocab file triggers a reload. Keys
+        missing from ``payload`` keep their current value, an explicitly
+        empty f5ttsModel selects DEFAULT_F5TTS_MODEL. Unusable numbers
+        (non-numeric, NaN/inf, nfe_step < 1) are ignored.
+        """
+        import math  # local import; only needed for the finiteness check
+
+        if "f5ttsModel" in payload:
+            # Same rule as ckpt/vocab below: present-but-empty means default.
+            new_model = str(payload["f5ttsModel"] or "").strip() or DEFAULT_F5TTS_MODEL
+        else:
+            new_model = self.model_id
+        new_ckpt = str(payload.get("f5ttsCkptFile", self.ckpt_file) or "").strip()
+        new_vocab = str(payload.get("f5ttsVocabFile", self.vocab_file) or "").strip()
+        try:
+            new_cfg = float(payload.get("f5ttsCfgStrength", self.cfg_strength))
+        except (TypeError, ValueError):
+            new_cfg = self.cfg_strength
+        if not math.isfinite(new_cfg):
+            new_cfg = self.cfg_strength
+        try:
+            new_nfe = int(payload.get("f5ttsNfeStep", self.nfe_step))
+        except (TypeError, ValueError, OverflowError):
+            new_nfe = self.nfe_step
+        if new_nfe < 1:
+            new_nfe = self.nfe_step
+
+        # Inference-only settings: no reload needed, the next synthesis uses them.
+        tuning_changed = (new_cfg, new_nfe) != (self.cfg_strength, self.nfe_step)
+        self.cfg_strength = new_cfg
+        self.nfe_step = new_nfe
+
+        target = (new_model, new_ckpt, new_vocab)
+        async with self._lock:
+            # Compare and commit under the lock so overlapping update tasks
+            # cannot interleave their writes or reload the same target twice.
+            if target == (self.model_id, self.ckpt_file, self.vocab_file):
+                if tuning_changed:
+                    logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d",
+                                new_cfg, new_nfe)
+                return
+            logger.info("F5-TTS Config-Wechsel: model=%s ckpt=%s vocab=%s — Reload",
+                        new_model, new_ckpt or "default", new_vocab or "default")
+            self.model_id, self.ckpt_file, self.vocab_file = target
+            # Release our reference to the old instance before loading the new one.
+            self.model = None
+            try:
+                loop = asyncio.get_running_loop()
+                await loop.run_in_executor(None, self._load_blocking)
+            except Exception:
+                # This coroutine is started via create_task(); log here so a
+                # failed reload is not lost with the unawaited task.
+                logger.exception("F5-TTS Reload fehlgeschlagen: model=%s", new_model)
+
     def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
-        # cfg_strength + nfe_step erhoeht damit das Modell nicht in andere
-        # Sprachen abrutscht (Bug bei Deutsch: rutscht ohne Verstaerkung
-        # gerne ins Spanische ab, weil F5TTS_v1_Base hauptsaechlich auf EN+CN
-        # trainiert ist).
wav, sr, _ = self.model.infer( ref_file=ref_wav, ref_text=ref_text, gen_text=gen_text, remove_silence=True, seed=-1, - cfg_strength=F5TTS_CFG_STRENGTH, - nfe_step=F5TTS_NFE_STEP, + cfg_strength=self.cfg_strength, + nfe_step=self.nfe_step, ) # F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard if not isinstance(wav, np.ndarray): @@ -581,6 +639,9 @@ async def run_loop(runner: F5Runner) -> None: else: fut.set_result(payload.get("text") or "") elif mtype == "config": + # F5-TTS-Settings aktualisieren (Modell, cfg_strength, nfe) + asyncio.create_task(runner.update_config(payload)) + # Voice-Preload bei Wechsel v = (payload.get("xttsVoice") or "").strip() if v and v != _last_diag_voice: _last_diag_voice = v