Compare commits

...

2 Commits

Author SHA1 Message Date
duffyduck b373f915b5 feat(f5tts): HF-URL Support fuer Custom Checkpoints (aihpi/F5-TTS-German)
_resolve_hf_path wandelt hf://user/repo/path → lokaler Download via
huggingface_hub.hf_hub_download. So kann man in Diagnostic einfach die
HF-Pfade fuer custom Modelle reinschreiben, ohne erst manuell zu
downloaden + zu mounten.

Format: hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors
        hf://aihpi/F5-TTS-German/vocab.txt

Diagnostic UI: Placeholders + Labels angepasst mit Beispiel-HF-Pfaden
und Hinweis dass fuer Fine-Tunes "F5TTS_Base" statt "F5TTS_v1_Base"
als Architektur-Name gesetzt werden muss.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 19:16:44 +02:00
duffyduck 7748834a0f fix(f5tts): Ref-WAV Preprocessing — Loudness + Silence-Trim
F5-TTS reagiert empfindlich auf leise / verrauschte / zerhackte
Referenzen — wir haben bisher nur auf 24kHz mono + 10s geclipped.
Jetzt zusaetzlich:
  - silenceremove am Anfang (bis Speech einsetzt, <-50dB)
  - silenceremove am Ende (0.5s Stille nach letzter Speech = Cutoff)
  - loudnorm -16 LUFS (EBU R128) fuer konsistente Amplitude

Damit sieht das Modell saubere, konstant laute Referenz-Audios statt
kaputter Clips mit Ausklang oder leiser Aufnahme. Besonders bei Deutsch
(wo F5TTS_v1_Base schwach ist) hilft jede Input-Konsistenz der Quali.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 19:07:58 +02:00
2 changed files with 64 additions and 24 deletions

View File

@@ -469,23 +469,25 @@
Hardcoded Defaults: F5TTS_v1_Base, cfg_strength=2.5, nfe_step=32. Hardcoded Defaults: F5TTS_v1_Base, cfg_strength=2.5, nfe_step=32.
</div> </div>
<label style="color:#8888AA;font-size:12px;">Modell-ID:</label> <label style="color:#8888AA;font-size:12px;">
Modell-Architektur (F5TTS_v1_Base = Default multilingual, F5TTS_Base = fuer die meisten Fine-Tunes):
</label>
<input type="text" id="diag-f5tts-model" <input type="text" id="diag-f5tts-model"
placeholder="F5TTS_v1_Base" placeholder="F5TTS_v1_Base"
style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;"> style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
<label style="color:#8888AA;font-size:12px;"> <label style="color:#8888AA;font-size:12px;">
Custom Checkpoint (HF-Repo "user/repo" oder Container-Pfad, leer = Default): Custom Checkpoint — HF-Pfad (hf://user/repo/file) oder lokaler Container-Pfad. Leer = Default.
</label> </label>
<input type="text" id="diag-f5tts-ckpt" <input type="text" id="diag-f5tts-ckpt"
placeholder="z.B. aoxo/F5-TTS-German" placeholder="z.B. hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors"
style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;"> style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
<label style="color:#8888AA;font-size:12px;"> <label style="color:#8888AA;font-size:12px;">
Custom Vocab (passend zum Checkpoint, optional): Custom Vocab — muss zum Checkpoint passen. Leer = Default.
</label> </label>
<input type="text" id="diag-f5tts-vocab" <input type="text" id="diag-f5tts-vocab"
placeholder="leer = Default" placeholder="z.B. hf://aihpi/F5-TTS-German/vocab.txt"
style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;"> style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
<div style="display:flex;gap:12px;"> <div style="display:flex;gap:12px;">

View File

@@ -99,6 +99,33 @@ def _get_f5tts_cls():
return _F5TTS_cls return _F5TTS_cls
def _resolve_hf_path(p: str) -> str:
    """Resolve an ``hf://`` reference to a local file path.

    A value starting with ``hf://`` is fetched from the Hugging Face Hub via
    ``huggingface_hub.hf_hub_download`` and the path returned by that call is
    handed back. Any other value (including an empty string) is returned
    unchanged.

    Format:  hf://user/repo/path/to/file.ext
    Example: hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors

    Args:
        p: Local path or ``hf://user/repo/path`` reference.

    Returns:
        The local path of the downloaded file, or ``p`` unchanged when it is
        not an ``hf://`` reference, is malformed, or the download failed.
        Failures are logged and never raised (best effort).
    """
    prefix = "hf://"
    if not p or not p.startswith(prefix):
        return p

    # Split into user / repo / file-inside-repo. Only the first two slashes
    # separate components; the file part may contain further slashes.
    parts = p[len(prefix):].split("/", 2)
    # Reject missing AND empty components (e.g. "hf://user/repo/" or
    # "hf:///repo/file") up front instead of attempting a doomed download.
    if len(parts) < 3 or not all(parts):
        logger.warning("Ungueltiges hf:// Format: %s (erwarte hf://user/repo/path)", p)
        return p

    user, repo, filename = parts
    repo_id = f"{user}/{repo}"
    try:
        # Imported lazily so that plain local paths never need huggingface_hub.
        from huggingface_hub import hf_hub_download

        logger.info("HF-Download: %s aus %s", filename, repo_id)
        local = hf_hub_download(repo_id=repo_id, filename=filename)
    except Exception as e:
        # Best effort: log with traceback and hand the original value back.
        logger.exception("HF-Download fehlgeschlagen fuer %s: %s", p, e)
        return p

    logger.info("HF-Download fertig: %s", local)
    return local
class F5Runner: class F5Runner:
"""Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking). """Haelt das F5-TTS-Modell. Synthese laeuft im Executor (blocking).
@@ -122,14 +149,16 @@ class F5Runner:
def _load_blocking(self) -> None: def _load_blocking(self) -> None:
cls = _get_f5tts_cls() cls = _get_f5tts_cls()
ckpt_resolved = _resolve_hf_path(self.ckpt_file) if self.ckpt_file else ""
vocab_resolved = _resolve_hf_path(self.vocab_file) if self.vocab_file else ""
logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...", logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...",
self.model_id, F5TTS_DEVICE, self.ckpt_file or "default") self.model_id, F5TTS_DEVICE, ckpt_resolved or "default")
self._load_started_at = time.time() self._load_started_at = time.time()
kwargs = {"model": self.model_id, "device": F5TTS_DEVICE} kwargs = {"model": self.model_id, "device": F5TTS_DEVICE}
if self.ckpt_file: if ckpt_resolved:
kwargs["ckpt_file"] = self.ckpt_file kwargs["ckpt_file"] = ckpt_resolved
if self.vocab_file: if vocab_resolved:
kwargs["vocab_file"] = self.vocab_file kwargs["vocab_file"] = vocab_resolved
self.model = cls(**kwargs) self.model = cls(**kwargs)
elapsed = time.time() - self._load_started_at elapsed = time.time() - self._load_started_at
logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)", logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)",
@@ -256,39 +285,48 @@ def voice_paths(name: str) -> tuple[Path, Path]:
def normalize_ref_wav(src_wav: Path, max_seconds: float = REF_MAX_SECONDS) -> tuple[Path, bool]: def normalize_ref_wav(src_wav: Path, max_seconds: float = REF_MAX_SECONDS) -> tuple[Path, bool]:
"""Bringt die Referenz-WAV in F5-TTS-freundliche Form: """Bringt die Referenz-WAV in F5-TTS-freundliche Form:
24kHz mono + max max_seconds Dauer. Original wird ueberschrieben wenn
Aenderungen noetig waren. * 24kHz mono
* max max_seconds Dauer
* Stille am Anfang + Ende abgeschnitten (silenceremove-Filter)
* Lautheit auf -16 LUFS normalisiert (loudnorm-Filter) damit
das Modell konsistente Amplituden sieht
F5-TTS reagiert empfindlich auf leise / verrauschte / zerhackte
Referenzen. Konsistente, saubere Input-Lautheit hilft der Quali.
Returns: Returns:
(path, was_modified) was_modified=True wenn die Datei wirklich (path, was_modified) was_modified=True wenn die Datei wirklich
geaendert wurde (Caller sollte dann den passenden .txt invalidieren). geaendert wurde (Caller sollte dann den passenden .txt invalidieren).
""" """
try:
info = sf.info(str(src_wav))
# Schon gut? Sample-Rate, Kanaele und Dauer passen?
if (info.samplerate == TARGET_SR and info.channels == 1
and info.duration <= max_seconds + 0.1):
return src_wav, False
except Exception:
info = None
tmp_out = src_wav.with_suffix(".conv.wav") tmp_out = src_wav.with_suffix(".conv.wav")
# silenceremove am Anfang: bis -50dB gesprochen wird
# silenceremove am Ende: ueber -50dB rein, dann 0.5s stille als Cutoff
# loudnorm: EBU R128, Ziel -16 LUFS
af = ("silenceremove=start_periods=1:start_duration=0.05:start_threshold=-50dB,"
"silenceremove=stop_periods=1:stop_duration=0.5:stop_threshold=-50dB,"
"loudnorm=I=-16:TP=-1.5:LRA=11")
cmd = ["ffmpeg", "-y", "-i", str(src_wav), cmd = ["ffmpeg", "-y", "-i", str(src_wav),
"-af", af,
"-ar", str(TARGET_SR), "-ac", "1", "-ar", str(TARGET_SR), "-ac", "1",
"-t", str(max_seconds), "-t", str(max_seconds),
"-f", "wav", str(tmp_out)] "-f", "wav", str(tmp_out)]
r = subprocess.run(cmd, capture_output=True, timeout=30) r = subprocess.run(cmd, capture_output=True, timeout=30)
if r.returncode != 0: if r.returncode != 0:
logger.warning("ffmpeg-Normalisierung von %s fehlgeschlagen: %s", logger.warning("ffmpeg-Normalisierung von %s fehlgeschlagen: %s",
src_wav, r.stderr.decode(errors="replace")[:200]) src_wav, r.stderr.decode(errors="replace")[:300])
try: try:
tmp_out.unlink() tmp_out.unlink()
except OSError: except OSError:
pass pass
return src_wav, False return src_wav, False
os.replace(tmp_out, src_wav) os.replace(tmp_out, src_wav)
logger.info("Referenz-WAV normalisiert: %s (24kHz mono, max %.1fs)", try:
src_wav.name, max_seconds) info = sf.info(str(src_wav))
logger.info("Referenz-WAV normalisiert: %s (%.1fs, %dHz mono, -16 LUFS, silence getrimmt)",
src_wav.name, info.duration, info.samplerate)
except Exception:
logger.info("Referenz-WAV normalisiert: %s", src_wav.name)
return src_wav, True return src_wav, True