diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py
index 8219ba8..3ae635e 100644
--- a/bridge/aria_bridge.py
+++ b/bridge/aria_bridge.py
@@ -496,6 +496,7 @@ class ARIABridge:
# Komponenten (TTS: immer XTTS remote, Piper wurde entfernt)
self.tts_enabled = True
self.xtts_voice = ""
+ self._f5tts_config: dict = {}
vc: dict = {}
# Gespeicherte Voice-Config laden
try:
@@ -505,7 +506,16 @@ class ARIABridge:
vc = json.load(f)
self.tts_enabled = vc.get("ttsEnabled", True)
self.xtts_voice = vc.get("xttsVoice", "")
- logger.info("Voice-Config geladen: tts=%s voice=%s", self.tts_enabled, self.xtts_voice or "default")
+ # F5-TTS-Felder aufsammeln (werden spaeter via RVS rebroadcastet,
+ # damit die f5tts-bridge auf der Gamebox die Settings auch nach
+ # Restart wiederbekommt — sonst stuende sie auf Hard-Defaults)
+ for k in ("f5ttsModel", "f5ttsCkptFile", "f5ttsVocabFile",
+ "f5ttsCfgStrength", "f5ttsNfeStep"):
+ if k in vc:
+ self._f5tts_config[k] = vc[k]
+ logger.info("Voice-Config geladen: tts=%s voice=%s f5tts=%s",
+ self.tts_enabled, self.xtts_voice or "default",
+ self._f5tts_config or "defaults")
except Exception as e:
logger.warning("Voice-Config laden fehlgeschlagen: %s", e)
# Whisper-Modell: Config hat Vorrang, dann env/Default (medium)
@@ -963,6 +973,29 @@ class ARIABridge:
except Exception as e:
logger.debug("[mode] Broadcast fehlgeschlagen: %s", e)
+    async def _broadcast_persisted_config(self) -> None:
+        """Send the voice configuration held by this bridge to RVS as one message.
+
+        Meant to run once after the RVS connection is up, so that bridges which
+        have just (re)connected receive the last chosen settings without anyone
+        having to touch the Diagnostic UI.
+
+        The message has type ``config``; its payload carries ``ttsEnabled``,
+        ``xttsVoice`` and ``whisperModel`` plus every entry stored in
+        ``self._f5tts_config``. Any exception is logged at debug level and
+        swallowed, so a failed broadcast never propagates to the caller.
+        """
+        try:
+            settings = {
+                "ttsEnabled": getattr(self, "tts_enabled", True),
+                "xttsVoice": getattr(self, "xtts_voice", ""),
+                "whisperModel": self.stt_engine.model_size,
+            }
+            # F5-TTS fields are merged last, exactly as they were collected.
+            f5tts_fields = getattr(self, "_f5tts_config", {}) or {}
+            settings.update(f5tts_fields)
+
+            # NOTE(review): loop.time() is the event loop's monotonic clock, not
+            # Unix time — confirm receivers do not expect epoch milliseconds.
+            stamp_ms = int(asyncio.get_event_loop().time() * 1000)
+            message = {
+                "type": "config",
+                "payload": settings,
+                "timestamp": stamp_ms,
+            }
+            await self._send_to_rvs(message)
+            logger.info("[rvs] Persistierte Config broadcastet: %s", settings)
+        except Exception as err:
+            logger.debug("[rvs] Config-Broadcast fehlgeschlagen: %s", err)
+
def _fetch_active_session(self) -> None:
"""Holt die aktive Session vom Diagnostic-Endpoint."""
try:
@@ -1032,6 +1065,12 @@ class ARIABridge:
# ihren UI-State sofort syncen koennen
await self._broadcast_current_mode()
+ # Persistierte Voice-Config broadcasten — die f5tts-bridge auf
+ # der Gamebox bekommt damit nach Restart die zuletzt in
+ # Diagnostic gewaehlten Settings wieder (sonst stuende sie auf
+ # ihren Hard-Defaults).
+ asyncio.create_task(self._broadcast_persisted_config())
+
# Heartbeat senden (RVS erwartet Ping alle 30s)
heartbeat_task = asyncio.create_task(self._rvs_heartbeat())
@@ -1195,7 +1234,10 @@ class ARIABridge:
return
elif msg_type == "config":
- # Konfiguration von App/Diagnostic empfangen + persistent speichern
+ # Konfiguration von App/Diagnostic empfangen + persistent speichern.
+ # Felder die nicht direkt zur aria-bridge gehoeren (f5tts*) werden
+ # nur persistiert; die f5tts-bridge auf der Gamebox empfaengt den
+ # gleichen RVS-Broadcast und reagiert selber.
changed = False
if "ttsEnabled" in payload:
self.tts_enabled = bool(payload["ttsEnabled"])
@@ -1209,14 +1251,19 @@ class ARIABridge:
new_model = payload["whisperModel"]
allowed = {"tiny", "base", "small", "medium", "large-v3"}
if new_model in allowed and new_model != self.stt_engine.model_size:
- # Merken und mitschicken an whisper-bridge (Gamebox).
- # Lokales Modell wird NICHT geladen — nur das Fallback braucht's,
- # und das passiert erst on-demand wenn Remote nicht antwortet.
logger.info("[rvs] Whisper-Modell → %s (nur Config; Modell laedt Gamebox)",
new_model)
self.stt_engine.model_size = new_model
self.stt_engine.model = None
changed = True
+ # F5-TTS-Felder: einfach persistieren, f5tts-bridge applied selber.
+ for k in ("f5ttsModel", "f5ttsCkptFile", "f5ttsVocabFile",
+ "f5ttsCfgStrength", "f5ttsNfeStep"):
+ if k in payload:
+ if not hasattr(self, "_f5tts_config"):
+ self._f5tts_config = {}
+ self._f5tts_config[k] = payload[k]
+ changed = True
# Persistent speichern in Shared Volume
if changed:
try:
@@ -1226,6 +1273,7 @@ class ARIABridge:
"xttsVoice": getattr(self, "xtts_voice", ""),
"whisperModel": self.stt_engine.model_size,
}
+ config_data.update(getattr(self, "_f5tts_config", {}))
with open("/shared/config/voice_config.json", "w") as f:
json.dump(config_data, f, indent=2)
logger.info("[rvs] Voice-Config gespeichert: %s", config_data)
diff --git a/diagnostic/index.html b/diagnostic/index.html
index 9d32de2..33577f7 100644
--- a/diagnostic/index.html
+++ b/diagnostic/index.html
@@ -450,6 +450,58 @@
+
+
+ F5-TTS Modell-Tuning (advanced)
+
+
+ Werden via RVS an die f5tts-bridge auf der Gamebox geschickt.
+ Modell-/Checkpoint-Wechsel triggert einen Reload (~30s).
+ Hardcoded Defaults: F5TTS_v1_Base, cfg_strength=2.5, nfe_step=32.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Hoeher = klebt staerker an Referenz
+
+
+
+
+
Hoeher = bessere Qualitaet, langsamer
+
+
+
+
+
+
+
Stimme klonen
@@ -841,6 +893,16 @@
const wSel = document.getElementById('diag-whisper-model');
if (wSel) wSel.value = msg.whisperModel;
}
+ // F5-TTS Tuning-Felder wiederherstellen (falls gesetzt)
+ const setIfPresent = (id, val) => {
+ const el = document.getElementById(id);
+ if (el && val !== undefined && val !== null && val !== '') el.value = val;
+ };
+ setIfPresent('diag-f5tts-model', msg.f5ttsModel);
+ setIfPresent('diag-f5tts-ckpt', msg.f5ttsCkptFile);
+ setIfPresent('diag-f5tts-vocab', msg.f5ttsVocabFile);
+ setIfPresent('diag-f5tts-cfg', msg.f5ttsCfgStrength);
+ setIfPresent('diag-f5tts-nfe', msg.f5ttsNfeStep);
return;
}
@@ -1570,7 +1632,19 @@
const ttsEnabled = document.getElementById('diag-tts-enabled').checked;
const xttsVoice = document.getElementById('diag-xtts-voice').value;
const whisperModel = document.getElementById('diag-whisper-model').value;
- send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel });
+ const f5ttsModel = document.getElementById('diag-f5tts-model')?.value || '';
+ const f5ttsCkptFile = document.getElementById('diag-f5tts-ckpt')?.value || '';
+ const f5ttsVocabFile = document.getElementById('diag-f5tts-vocab')?.value || '';
+ const f5ttsCfgRaw = document.getElementById('diag-f5tts-cfg')?.value || '';
+ const f5ttsNfeRaw = document.getElementById('diag-f5tts-nfe')?.value || '';
+ const f5ttsCfgStrength = f5ttsCfgRaw ? parseFloat(f5ttsCfgRaw) : undefined;
+ const f5ttsNfeStep = f5ttsNfeRaw ? parseInt(f5ttsNfeRaw, 10) : undefined;
+ send({
+ action: 'send_voice_config',
+ ttsEnabled, xttsVoice, whisperModel,
+ f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
+ f5ttsCfgStrength, f5ttsNfeStep,
+ });
const statusEl = document.getElementById('voice-status');
if (statusEl && xttsVoice) {
statusEl.textContent = `⏳ Stimme "${xttsVoice}" wird geladen...`;
diff --git a/diagnostic/server.js b/diagnostic/server.js
index 3ed43e6..a780680 100644
--- a/diagnostic/server.js
+++ b/diagnostic/server.js
@@ -1423,6 +1423,25 @@ wss.on("connection", (ws) => {
xttsVoice: msg.xttsVoice || "",
};
if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel;
+ // F5-TTS Tuning-Felder — leere Strings entfernen damit der Default greift
+ if (msg.f5ttsModel !== undefined) {
+ if (msg.f5ttsModel) voiceConfig.f5ttsModel = msg.f5ttsModel;
+ else delete voiceConfig.f5ttsModel;
+ }
+ if (msg.f5ttsCkptFile !== undefined) {
+ if (msg.f5ttsCkptFile) voiceConfig.f5ttsCkptFile = msg.f5ttsCkptFile;
+ else delete voiceConfig.f5ttsCkptFile;
+ }
+ if (msg.f5ttsVocabFile !== undefined) {
+ if (msg.f5ttsVocabFile) voiceConfig.f5ttsVocabFile = msg.f5ttsVocabFile;
+ else delete voiceConfig.f5ttsVocabFile;
+ }
+ if (msg.f5ttsCfgStrength !== undefined && !isNaN(msg.f5ttsCfgStrength)) {
+ voiceConfig.f5ttsCfgStrength = msg.f5ttsCfgStrength;
+ }
+ if (msg.f5ttsNfeStep !== undefined && !isNaN(msg.f5ttsNfeStep)) {
+ voiceConfig.f5ttsNfeStep = msg.f5ttsNfeStep;
+ }
try {
fs.mkdirSync("/shared/config", { recursive: true });
fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
diff --git a/xtts/docker-compose.yml b/xtts/docker-compose.yml
index 24f4c12..f4b8482 100644
--- a/xtts/docker-compose.yml
+++ b/xtts/docker-compose.yml
@@ -33,17 +33,14 @@ services:
- ./voices:/voices # WAV + TXT Referenz
- f5tts-models:/root/.cache/huggingface # Model-Cache persistieren
environment:
+ # Bootstrap-only — alle anderen F5-TTS-Settings (Modell, cfg_strength,
+ # nfe_step, Custom-Checkpoint) kommen ueber Diagnostic via RVS-config.
- RVS_HOST=${RVS_HOST}
- RVS_PORT=${RVS_PORT:-443}
- RVS_TLS=${RVS_TLS:-true}
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
- RVS_TOKEN=${RVS_TOKEN}
- - F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base}
- - F5TTS_CKPT_FILE=${F5TTS_CKPT_FILE:-}
- - F5TTS_VOCAB_FILE=${F5TTS_VOCAB_FILE:-}
- F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
- - F5TTS_CFG_STRENGTH=${F5TTS_CFG_STRENGTH:-2.5}
- - F5TTS_NFE_STEP=${F5TTS_NFE_STEP:-32}
- VOICES_DIR=/voices
restart: unless-stopped
diff --git a/xtts/f5tts/bridge.py b/xtts/f5tts/bridge.py
index 3330cb3..7ae4cee 100644
--- a/xtts/f5tts/bridge.py
+++ b/xtts/f5tts/bridge.py
@@ -52,15 +52,23 @@ RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
-F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base")
-F5TTS_CKPT_FILE = os.getenv("F5TTS_CKPT_FILE", "") # optional: HF-Repo oder lokales .pt
-F5TTS_VOCAB_FILE = os.getenv("F5TTS_VOCAB_FILE", "") # optional: zugehoerige vocab.txt
-F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")
+# F5-TTS Konfiguration
+# ─────────────────────────────────────────────────────────────────
+# Defaults sind hard-coded — bewusst KEINE ENV-Vars (ausser F5TTS_DEVICE,
+# weil Hardware-Bootstrap). Alle Settings werden zur Laufzeit via RVS
+# config-Broadcast aus Diagnostic uebersteuert (Felder f5ttsModel,
+# f5ttsCkptFile, f5ttsVocabFile, f5ttsCfgStrength, f5ttsNfeStep).
+F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda") # nur Bootstrap
+
+DEFAULT_F5TTS_MODEL = "F5TTS_v1_Base"
+DEFAULT_F5TTS_CKPT_FILE = "" # leer = Default-Checkpoint von HF
+DEFAULT_F5TTS_VOCAB_FILE = "" # leer = Default-Vocab vom Modell
# cfg_strength: wie stark der Generator am Referenz-Voice klebt.
-# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) hilft >2.5
+# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) hilft 2.5+,
# damit das Modell nicht in eine andere Sprache abrutscht.
-F5TTS_CFG_STRENGTH = float(os.getenv("F5TTS_CFG_STRENGTH", "2.5"))
-F5TTS_NFE_STEP = int(os.getenv("F5TTS_NFE_STEP", "32"))
+DEFAULT_F5TTS_CFG_STRENGTH = 2.5
+DEFAULT_F5TTS_NFE_STEP = 32
+
VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16
@@ -86,25 +94,36 @@ def _get_f5tts_cls():
class F5Runner:
- """Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking)."""
+ """Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking).
+
+ Live-Settings (Modell, cfg_strength, nfe_step) werden ueber update_config()
+ aus dem Diagnostic-Config-Broadcast gesetzt; bei Modell-Wechsel wird
+ automatisch neu geladen.
+ """
def __init__(self) -> None:
self.model = None
self._lock = asyncio.Lock()
+ # Aktuelle Werte — gestartet mit Hard-Defaults, ueberschrieben von Diagnostic
+ self.model_id: str = DEFAULT_F5TTS_MODEL
+ self.ckpt_file: str = DEFAULT_F5TTS_CKPT_FILE
+ self.vocab_file: str = DEFAULT_F5TTS_VOCAB_FILE
+ self.cfg_strength: float = DEFAULT_F5TTS_CFG_STRENGTH
+ self.nfe_step: int = DEFAULT_F5TTS_NFE_STEP
def _load_blocking(self) -> None:
cls = _get_f5tts_cls()
logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...",
- F5TTS_MODEL, F5TTS_DEVICE, F5TTS_CKPT_FILE or "default")
+ self.model_id, F5TTS_DEVICE, self.ckpt_file or "default")
t0 = time.time()
- kwargs = {"model": F5TTS_MODEL, "device": F5TTS_DEVICE}
- if F5TTS_CKPT_FILE:
- kwargs["ckpt_file"] = F5TTS_CKPT_FILE
- if F5TTS_VOCAB_FILE:
- kwargs["vocab_file"] = F5TTS_VOCAB_FILE
+ kwargs = {"model": self.model_id, "device": F5TTS_DEVICE}
+ if self.ckpt_file:
+ kwargs["ckpt_file"] = self.ckpt_file
+ if self.vocab_file:
+ kwargs["vocab_file"] = self.vocab_file
self.model = cls(**kwargs)
logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)",
- time.time() - t0, F5TTS_CFG_STRENGTH, F5TTS_NFE_STEP)
+ time.time() - t0, self.cfg_strength, self.nfe_step)
async def ensure_loaded(self) -> None:
async with self._lock:
@@ -113,19 +132,58 @@ class F5Runner:
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, self._load_blocking)
+    async def update_config(self, payload: dict) -> None:
+        """Apply the ``f5tts*`` fields of a config broadcast to this runner.
+
+        Sampling parameters (``f5ttsCfgStrength``, ``f5ttsNfeStep``) are stored
+        on the runner and picked up by the next synthesis call. A change of
+        ``f5ttsModel``, ``f5ttsCkptFile`` or ``f5ttsVocabFile`` drops the loaded
+        model and loads the new one in the default executor.
+
+        Keys that are absent keep their current value; an empty or null
+        checkpoint/vocab value selects the built-in default. Numeric values
+        that cannot be parsed or are out of range (cfg_strength must be finite
+        and >= 0, nfe_step must be >= 1) are ignored with a warning.
+
+        If loading the new model fails, the previous model/checkpoint/vocab
+        selection is restored and the error is logged. The exception is not
+        re-raised because this coroutine is started as a fire-and-forget task
+        and nobody would retrieve it.
+
+        Args:
+            payload: The ``payload`` dict of a ``config`` message.
+        """
+        # --- Sampling parameters: validated, then active immediately ---------
+        new_cfg = self.cfg_strength
+        if "f5ttsCfgStrength" in payload:
+            try:
+                cfg_candidate = float(payload["f5ttsCfgStrength"])
+            except (TypeError, ValueError, OverflowError):
+                cfg_candidate = None
+            # The chained comparison is False for NaN, so NaN/inf/negative are rejected.
+            if cfg_candidate is not None and 0.0 <= cfg_candidate < float("inf"):
+                new_cfg = cfg_candidate
+            else:
+                logger.warning("F5-TTS: ungueltiger f5ttsCfgStrength=%r ignoriert",
+                               payload["f5ttsCfgStrength"])
+
+        new_nfe = self.nfe_step
+        if "f5ttsNfeStep" in payload:
+            try:
+                # int() raises OverflowError for infinity and ValueError for NaN.
+                nfe_candidate = int(payload["f5ttsNfeStep"])
+            except (TypeError, ValueError, OverflowError):
+                nfe_candidate = None
+            if nfe_candidate is not None and nfe_candidate >= 1:
+                new_nfe = nfe_candidate
+            else:
+                logger.warning("F5-TTS: ungueltiger f5ttsNfeStep=%r ignoriert",
+                               payload["f5ttsNfeStep"])
+
+        tuning_changed = (new_cfg, new_nfe) != (self.cfg_strength, self.nfe_step)
+        self.cfg_strength = new_cfg
+        self.nfe_step = new_nfe
+
+        # --- Model selection: read, compare and swap under the lock ----------
+        # Doing all of this inside the lock keeps overlapping config tasks from
+        # changing the identifiers while _load_blocking() is reading them.
+        async with self._lock:
+            current = (self.model_id, self.ckpt_file, self.vocab_file)
+            wanted = (
+                str(payload.get("f5ttsModel") or "").strip() or self.model_id,
+                str(payload.get("f5ttsCkptFile", self.ckpt_file) or "").strip(),
+                str(payload.get("f5ttsVocabFile", self.vocab_file) or "").strip(),
+            )
+
+            if wanted == current:
+                # Only log when something actually changed; config broadcasts
+                # without any F5-TTS change would otherwise spam the log.
+                if tuning_changed:
+                    logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)
+                return
+
+            logger.info("F5-TTS Config-Wechsel: model=%s ckpt=%s vocab=%s — Reload",
+                        wanted[0], wanted[1] or "default", wanted[2] or "default")
+            had_model = self.model is not None
+            self.model_id, self.ckpt_file, self.vocab_file = wanted
+            # Drop this runner's reference before loading, so the old instance
+            # can be garbage-collected if nothing else still holds it.
+            self.model = None
+
+            loop = asyncio.get_running_loop()
+            try:
+                await loop.run_in_executor(None, self._load_blocking)
+            except Exception:
+                logger.exception("F5-TTS Reload fehlgeschlagen — zurueck auf model=%s ckpt=%s vocab=%s",
+                                 current[0], current[1] or "default", current[2] or "default")
+                # Roll back so the runner never stays on a configuration that
+                # is known not to load.
+                self.model_id, self.ckpt_file, self.vocab_file = current
+                if had_model:
+                    try:
+                        await loop.run_in_executor(None, self._load_blocking)
+                    except Exception:
+                        # self.model stays None here.
+                        logger.exception("F5-TTS: vorheriges Modell konnte nicht wiederhergestellt werden")
+
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
- # cfg_strength + nfe_step erhoeht damit das Modell nicht in andere
- # Sprachen abrutscht (Bug bei Deutsch: rutscht ohne Verstaerkung
- # gerne ins Spanische ab, weil F5TTS_v1_Base hauptsaechlich auf EN+CN
- # trainiert ist).
wav, sr, _ = self.model.infer(
ref_file=ref_wav,
ref_text=ref_text,
gen_text=gen_text,
remove_silence=True,
seed=-1,
- cfg_strength=F5TTS_CFG_STRENGTH,
- nfe_step=F5TTS_NFE_STEP,
+ cfg_strength=self.cfg_strength,
+ nfe_step=self.nfe_step,
)
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
if not isinstance(wav, np.ndarray):
@@ -581,6 +639,9 @@ async def run_loop(runner: F5Runner) -> None:
else:
fut.set_result(payload.get("text") or "")
elif mtype == "config":
+ # F5-TTS-Settings aktualisieren (Modell, cfg_strength, nfe)
+ asyncio.create_task(runner.update_config(payload))
+ # Voice-Preload bei Wechsel
v = (payload.get("xttsVoice") or "").strip()
if v and v != _last_diag_voice:
_last_diag_voice = v