feat: F5-TTS Tuning ueber Diagnostic statt .env
Folgt der "keine neuen Settings in .env" Regel.
f5tts/bridge.py:
- F5TTS_MODEL/CKPT_FILE/VOCAB_FILE/CFG_STRENGTH/NFE_STEP ENV-Vars raus
- Hard-coded Defaults im Code (DEFAULT_F5TTS_*)
- F5Runner besitzt Live-Settings als Instance-Vars + update_config()
- config-Broadcast triggert Modell-Reload nur wenn Modell-relevantes
sich aendert (cfg_strength/nfe_step ohne Reload)
- F5TTS_DEVICE bleibt ENV (Hardware-Bootstrap)
xtts/docker-compose.yml: F5TTS_* ENV-Vars rausgenommen, Kommentar
verweist auf Diagnostic-Config.
aria-bridge: nimmt f5tts*-Felder im config-Handler entgegen, persistiert
sie in voice_config.json. Beim RVS-Connect broadcastet die Bridge die
persistierte Config einmalig — damit die f5tts-bridge nach Container-
Restart automatisch die zuletzt gewaehlten Settings bekommt, ohne dass
der User in Diagnostic was klicken muss.
Diagnostic UI:
- Neuer aufklappbarer "F5-TTS Modell-Tuning (advanced)" Bereich
- Felder: Modell-ID, Custom-Checkpoint, Vocab, cfg_strength, nfe_step
- voice_config beim Laden: Felder werden zurueck in die UI gesetzt
- sendVoiceConfig schickt die neuen Felder mit
- Server: send_voice_config persistiert die Felder, leere Strings
werden geloescht damit die Hard-Defaults greifen
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,17 +33,14 @@ services:
|
||||
- ./voices:/voices # WAV + TXT Referenz
|
||||
- f5tts-models:/root/.cache/huggingface # Model-Cache persistieren
|
||||
environment:
|
||||
# Bootstrap-only — alle anderen F5-TTS-Settings (Modell, cfg_strength,
|
||||
# nfe_step, Custom-Checkpoint) kommen ueber Diagnostic via RVS-config.
|
||||
- RVS_HOST=${RVS_HOST}
|
||||
- RVS_PORT=${RVS_PORT:-443}
|
||||
- RVS_TLS=${RVS_TLS:-true}
|
||||
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
|
||||
- RVS_TOKEN=${RVS_TOKEN}
|
||||
- F5TTS_MODEL=${F5TTS_MODEL:-F5TTS_v1_Base}
|
||||
- F5TTS_CKPT_FILE=${F5TTS_CKPT_FILE:-}
|
||||
- F5TTS_VOCAB_FILE=${F5TTS_VOCAB_FILE:-}
|
||||
- F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
|
||||
- F5TTS_CFG_STRENGTH=${F5TTS_CFG_STRENGTH:-2.5}
|
||||
- F5TTS_NFE_STEP=${F5TTS_NFE_STEP:-32}
|
||||
- VOICES_DIR=/voices
|
||||
restart: unless-stopped
|
||||
|
||||
|
||||
+82
-21
@@ -52,15 +52,23 @@ RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
|
||||
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
|
||||
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
|
||||
|
||||
F5TTS_MODEL = os.getenv("F5TTS_MODEL", "F5TTS_v1_Base")
|
||||
F5TTS_CKPT_FILE = os.getenv("F5TTS_CKPT_FILE", "") # optional: HF-Repo oder lokales .pt
|
||||
F5TTS_VOCAB_FILE = os.getenv("F5TTS_VOCAB_FILE", "") # optional: zugehoerige vocab.txt
|
||||
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")
|
||||
# F5-TTS Konfiguration
|
||||
# ─────────────────────────────────────────────────────────────────
|
||||
# Defaults sind hard-coded — bewusst KEINE ENV-Vars (ausser F5TTS_DEVICE,
|
||||
# weil Hardware-Bootstrap). Alle Settings werden zur Laufzeit via RVS
|
||||
# config-Broadcast aus Diagnostic uebersteuert (Felder f5ttsModel,
|
||||
# f5ttsCkptFile, f5ttsVocabFile, f5ttsCfgStrength, f5ttsNfeStep).
|
||||
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda") # nur Bootstrap
|
||||
|
||||
DEFAULT_F5TTS_MODEL = "F5TTS_v1_Base"
|
||||
DEFAULT_F5TTS_CKPT_FILE = "" # leer = Default-Checkpoint von HF
|
||||
DEFAULT_F5TTS_VOCAB_FILE = "" # leer = Default-Vocab vom Modell
|
||||
# cfg_strength: wie stark der Generator am Referenz-Voice klebt.
|
||||
# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) hilft >2.5
|
||||
# Default F5-TTS = 2.0. Bei nicht-EN/CN Sprachen (Deutsch!) hilft 2.5+,
|
||||
# damit das Modell nicht in eine andere Sprache abrutscht.
|
||||
F5TTS_CFG_STRENGTH = float(os.getenv("F5TTS_CFG_STRENGTH", "2.5"))
|
||||
F5TTS_NFE_STEP = int(os.getenv("F5TTS_NFE_STEP", "32"))
|
||||
DEFAULT_F5TTS_CFG_STRENGTH = 2.5
|
||||
DEFAULT_F5TTS_NFE_STEP = 32
|
||||
|
||||
VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
|
||||
|
||||
PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16
|
||||
@@ -86,25 +94,36 @@ def _get_f5tts_cls():
|
||||
|
||||
|
||||
class F5Runner:
|
||||
"""Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking)."""
|
||||
"""Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking).
|
||||
|
||||
Live-Settings (Modell, cfg_strength, nfe_step) werden ueber update_config()
|
||||
aus dem Diagnostic-Config-Broadcast gesetzt; bei Modell-Wechsel wird
|
||||
automatisch neu geladen.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
    # Tunable settings start at the hard-coded module defaults;
    # update_config() overwrites them when a config broadcast arrives.
    self.nfe_step: int = DEFAULT_F5TTS_NFE_STEP
    self.cfg_strength: float = DEFAULT_F5TTS_CFG_STRENGTH
    self.vocab_file: str = DEFAULT_F5TTS_VOCAB_FILE
    self.ckpt_file: str = DEFAULT_F5TTS_CKPT_FILE
    self.model_id: str = DEFAULT_F5TTS_MODEL

    # Lock taken by ensure_loaded() and update_config() around model loads.
    self._lock = asyncio.Lock()

    # No model instance yet; _load_blocking() assigns it.
    self.model = None
|
||||
|
||||
def _load_blocking(self) -> None:
|
||||
cls = _get_f5tts_cls()
|
||||
logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...",
|
||||
F5TTS_MODEL, F5TTS_DEVICE, F5TTS_CKPT_FILE or "default")
|
||||
self.model_id, F5TTS_DEVICE, self.ckpt_file or "default")
|
||||
t0 = time.time()
|
||||
kwargs = {"model": F5TTS_MODEL, "device": F5TTS_DEVICE}
|
||||
if F5TTS_CKPT_FILE:
|
||||
kwargs["ckpt_file"] = F5TTS_CKPT_FILE
|
||||
if F5TTS_VOCAB_FILE:
|
||||
kwargs["vocab_file"] = F5TTS_VOCAB_FILE
|
||||
kwargs = {"model": self.model_id, "device": F5TTS_DEVICE}
|
||||
if self.ckpt_file:
|
||||
kwargs["ckpt_file"] = self.ckpt_file
|
||||
if self.vocab_file:
|
||||
kwargs["vocab_file"] = self.vocab_file
|
||||
self.model = cls(**kwargs)
|
||||
logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)",
|
||||
time.time() - t0, F5TTS_CFG_STRENGTH, F5TTS_NFE_STEP)
|
||||
time.time() - t0, self.cfg_strength, self.nfe_step)
|
||||
|
||||
async def ensure_loaded(self) -> None:
|
||||
async with self._lock:
|
||||
@@ -113,19 +132,58 @@ class F5Runner:
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, self._load_blocking)
|
||||
|
||||
async def update_config(self, payload: dict) -> None:
    """Apply the f5tts* fields of a config broadcast to this runner.

    Recognised keys are ``f5ttsModel``, ``f5ttsCkptFile``,
    ``f5ttsVocabFile``, ``f5ttsCfgStrength`` and ``f5ttsNfeStep``.

    * ``cfg_strength`` / ``nfe_step`` are stored immediately and read by the
      next synthesis; they never trigger a reload. Values that cannot be
      converted, are not finite, or (for ``nfe_step``) are below 1 are
      ignored and the current value is kept.
    * A missing or empty ``f5ttsModel`` keeps the current model id. A missing
      ``f5ttsCkptFile`` / ``f5ttsVocabFile`` keeps the current value, while
      an empty one resets it to ``""`` (the default checkpoint / vocab).
    * If model id, checkpoint or vocab differ from the current values, the
      model is dropped and reloaded in the default executor.

    Args:
        payload: Decoded config message; unknown keys are ignored.

    Raises:
        Exception: Whatever ``_load_blocking`` raises. Before re-raising, the
            previous model/checkpoint/vocab settings are restored and
            ``self.model`` stays ``None``.
    """
    import math  # function-scope: only needed for the finiteness check

    def _text(key: str, current: str, *, empty_keeps_current: bool) -> str:
        # Normalise one string field: strip whitespace, map None to "",
        # and ignore values that are not strings at all.
        if key not in payload:
            return current
        raw = payload[key]
        if raw is None:
            raw = ""
        if not isinstance(raw, str):
            logger.warning("F5-TTS Config: ungueltiger Wert fuer %s ignoriert: %r",
                           key, raw)
            return current
        cleaned = raw.strip()
        if not cleaned and empty_keeps_current:
            return current
        return cleaned

    # --- Settings that need NO model reload (active on next synthesis) ---
    try:
        new_cfg = float(payload.get("f5ttsCfgStrength", self.cfg_strength))
    except (TypeError, ValueError):
        new_cfg = self.cfg_strength
    if not math.isfinite(new_cfg):
        # JSON decoders may yield NaN/Infinity; never hand those to infer().
        new_cfg = self.cfg_strength

    try:
        new_nfe = int(payload.get("f5ttsNfeStep", self.nfe_step))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf"))
        new_nfe = self.nfe_step
    if new_nfe < 1:
        new_nfe = self.nfe_step

    self.cfg_strength = new_cfg
    self.nfe_step = new_nfe

    # --- Settings that trigger a reload ---
    # This coroutine is started fire-and-forget per config message, so two
    # updates can overlap. Comparing and assigning under the lock keeps the
    # settings stable while _load_blocking() reads them in the executor.
    async with self._lock:
        previous = (self.model_id, self.ckpt_file, self.vocab_file)
        requested = (
            _text("f5ttsModel", self.model_id, empty_keeps_current=True),
            _text("f5ttsCkptFile", self.ckpt_file, empty_keeps_current=False),
            _text("f5ttsVocabFile", self.vocab_file, empty_keeps_current=False),
        )

        if requested == previous:
            logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)
            return

        new_model, new_ckpt, new_vocab = requested
        logger.info("F5-TTS Config-Wechsel: model=%s ckpt=%s vocab=%s — Reload",
                    new_model, new_ckpt or "default", new_vocab or "default")
        self.model_id, self.ckpt_file, self.vocab_file = requested

        # Drop our reference to the old instance before loading the new one.
        self.model = None

        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, self._load_blocking)
        except Exception:
            # A bad model id / checkpoint must not stay configured.
            # NOTE(review): presumably ensure_loaded() reloads lazily while
            # self.model is None — its body is not fully visible; confirm.
            self.model_id, self.ckpt_file, self.vocab_file = previous
            logger.exception(
                "F5-TTS Reload fehlgeschlagen — zurueck auf model=%s ckpt=%s vocab=%s",
                previous[0], previous[1] or "default", previous[2] or "default")
            raise
|
||||
|
||||
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
|
||||
# cfg_strength + nfe_step erhoeht damit das Modell nicht in andere
|
||||
# Sprachen abrutscht (Bug bei Deutsch: rutscht ohne Verstaerkung
|
||||
# gerne ins Spanische ab, weil F5TTS_v1_Base hauptsaechlich auf EN+CN
|
||||
# trainiert ist).
|
||||
wav, sr, _ = self.model.infer(
|
||||
ref_file=ref_wav,
|
||||
ref_text=ref_text,
|
||||
gen_text=gen_text,
|
||||
remove_silence=True,
|
||||
seed=-1,
|
||||
cfg_strength=F5TTS_CFG_STRENGTH,
|
||||
nfe_step=F5TTS_NFE_STEP,
|
||||
cfg_strength=self.cfg_strength,
|
||||
nfe_step=self.nfe_step,
|
||||
)
|
||||
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
|
||||
if not isinstance(wav, np.ndarray):
|
||||
@@ -581,6 +639,9 @@ async def run_loop(runner: F5Runner) -> None:
|
||||
else:
|
||||
fut.set_result(payload.get("text") or "")
|
||||
elif mtype == "config":
|
||||
# F5-TTS-Settings aktualisieren (Modell, cfg_strength, nfe)
|
||||
asyncio.create_task(runner.update_config(payload))
|
||||
# Voice-Preload bei Wechsel
|
||||
v = (payload.get("xttsVoice") or "").strip()
|
||||
if v and v != _last_diag_voice:
|
||||
_last_diag_voice = v
|
||||
|
||||
Reference in New Issue
Block a user