feat: F5-TTS Tuning ueber Diagnostic statt .env

Folgt der "keine neuen Settings in .env" Regel.

f5tts/bridge.py:
  - F5TTS_MODEL/CKPT_FILE/VOCAB_FILE/CFG_STRENGTH/NFE_STEP ENV-Vars raus
  - Hard-coded Defaults im Code (DEFAULT_F5TTS_*)
  - F5Runner besitzt Live-Settings als Instance-Vars + update_config()
  - config-Broadcast triggert einen Modell-Reload nur, wenn sich etwas
    Modell-relevantes aendert (cfg_strength/nfe_step wirken ohne Reload)
  - F5TTS_DEVICE bleibt ENV (Hardware-Bootstrap)

xtts/docker-compose.yml: F5TTS_* ENV-Vars rausgenommen, Kommentar
verweist auf Diagnostic-Config.

aria-bridge: nimmt f5tts*-Felder im config-Handler entgegen, persistiert
sie in voice_config.json. Beim RVS-Connect broadcastet die Bridge die
persistierte Config einmalig — damit die f5tts-bridge nach Container-
Restart automatisch die zuletzt gewaehlten Settings bekommt, ohne dass
der User in Diagnostic was klicken muss.

Diagnostic UI:
  - Neuer aufklappbarer "F5-TTS Modell-Tuning (advanced)" Bereich
  - Felder: Modell-ID, Custom-Checkpoint, Vocab, cfg_strength, nfe_step
  - voice_config beim Laden: Felder werden zurueck in die UI gesetzt
  - sendVoiceConfig schickt die neuen Felder mit
  - Server: send_voice_config persistiert die Felder, leere Strings
    werden geloescht damit die Hard-Defaults greifen

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-24 15:44:58 +02:00
parent 467f95424e
commit 187ffad7ee
5 changed files with 231 additions and 32 deletions
+75 -1
View File
@@ -450,6 +450,58 @@
<!-- Geklonte Stimmen — Liste mit Loeschen -->
<div id="xtts-voice-list" style="margin-bottom:12px;"></div>
<!-- F5-TTS model tuning: collapsible panel with the five tuning inputs.
     Each label carries a `for` attribute that matches its input's id so the
     label is programmatically associated with the control (clicking the label
     focuses the input, assistive tech announces it). The "Anwenden" button
     calls sendVoiceConfig(). -->
<details style="background:#0D0D1A;border:1px solid #2A2A3E;border-radius:6px;padding:10px 12px;margin-bottom:12px;">
  <summary style="color:#8888AA;font-size:12px;cursor:pointer;">F5-TTS Modell-Tuning (advanced)</summary>
  <div style="margin-top:10px;display:flex;flex-direction:column;gap:8px;">
    <div style="color:#8888AA;font-size:11px;">
      Werden via RVS an die f5tts-bridge auf der Gamebox geschickt.
      Modell-/Checkpoint-Wechsel triggert einen Reload (~30s).
      Hardcoded Defaults: F5TTS_v1_Base, cfg_strength=2.5, nfe_step=32.
    </div>
    <label for="diag-f5tts-model" style="color:#8888AA;font-size:12px;">Modell-ID:</label>
    <input type="text" id="diag-f5tts-model"
      placeholder="F5TTS_v1_Base"
      style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
    <label for="diag-f5tts-ckpt" style="color:#8888AA;font-size:12px;">
      Custom Checkpoint (HF-Repo "user/repo" oder Container-Pfad, leer = Default):
    </label>
    <input type="text" id="diag-f5tts-ckpt"
      placeholder="z.B. aoxo/F5-TTS-German"
      style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
    <label for="diag-f5tts-vocab" style="color:#8888AA;font-size:12px;">
      Custom Vocab (passend zum Checkpoint, optional):
    </label>
    <input type="text" id="diag-f5tts-vocab"
      placeholder="leer = Default"
      style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
    <div style="display:flex;gap:12px;">
      <div style="flex:1;">
        <label for="diag-f5tts-cfg" style="color:#8888AA;font-size:12px;">cfg_strength (1.0 - 5.0):</label>
        <input type="number" id="diag-f5tts-cfg" step="0.1" min="1" max="5"
          placeholder="2.5"
          style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;width:100%;box-sizing:border-box;">
        <div style="color:#666680;font-size:10px;">Hoeher = klebt staerker an Referenz</div>
      </div>
      <div style="flex:1;">
        <label for="diag-f5tts-nfe" style="color:#8888AA;font-size:12px;">nfe_step (8 - 64):</label>
        <input type="number" id="diag-f5tts-nfe" step="1" min="8" max="64"
          placeholder="32"
          style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;width:100%;box-sizing:border-box;">
        <div style="color:#666680;font-size:10px;">Hoeher = bessere Qualitaet, langsamer</div>
      </div>
    </div>
    <button class="btn primary" onclick="sendVoiceConfig()" style="padding:6px 14px;font-size:12px;align-self:flex-start;margin-top:6px;">
      Anwenden
    </button>
  </div>
</details>
<!-- Voice Cloning -->
<div style="background:#1E1E2E;border-radius:8px;padding:12px;margin-top:8px;">
<div style="color:#0096FF;font-size:13px;font-weight:600;margin-bottom:8px;">Stimme klonen</div>
@@ -841,6 +893,16 @@
const wSel = document.getElementById('diag-whisper-model');
if (wSel) wSel.value = msg.whisperModel;
}
// F5-TTS Tuning-Felder wiederherstellen (falls gesetzt)
// Copies `val` into the `value` of the element with the given id.
// Does nothing when the element cannot be found, or when `val` is
// undefined, null or the empty string (so an unset config field never
// overwrites what is currently in the input).
const setIfPresent = (id, val) => {
  const isUnset = val === undefined || val === null || val === '';
  const field = document.getElementById(id);
  if (!field || isUnset) return;
  field.value = val;
};
setIfPresent('diag-f5tts-model', msg.f5ttsModel);
setIfPresent('diag-f5tts-ckpt', msg.f5ttsCkptFile);
setIfPresent('diag-f5tts-vocab', msg.f5ttsVocabFile);
setIfPresent('diag-f5tts-cfg', msg.f5ttsCfgStrength);
setIfPresent('diag-f5tts-nfe', msg.f5ttsNfeStep);
return;
}
@@ -1570,7 +1632,19 @@
const ttsEnabled = document.getElementById('diag-tts-enabled').checked;
const xttsVoice = document.getElementById('diag-xtts-voice').value;
const whisperModel = document.getElementById('diag-whisper-model').value;
send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel });
// Reads the current value of an F5-TTS tuning input; a missing element or a
// blank input both yield ''.
const readF5ttsInput = (id) => document.getElementById(id)?.value || '';
const f5ttsModel = readF5ttsInput('diag-f5tts-model');
const f5ttsCkptFile = readF5ttsInput('diag-f5tts-ckpt');
const f5ttsVocabFile = readF5ttsInput('diag-f5tts-vocab');
const f5ttsCfgRaw = readF5ttsInput('diag-f5tts-cfg');
const f5ttsNfeRaw = readF5ttsInput('diag-f5tts-nfe');
// Blank numeric inputs are passed on as undefined rather than as a number.
const f5ttsCfgStrength = f5ttsCfgRaw === '' ? undefined : parseFloat(f5ttsCfgRaw);
const f5ttsNfeStep = f5ttsNfeRaw === '' ? undefined : parseInt(f5ttsNfeRaw, 10);
send({
  action: 'send_voice_config',
  ttsEnabled,
  xttsVoice,
  whisperModel,
  f5ttsModel,
  f5ttsCkptFile,
  f5ttsVocabFile,
  f5ttsCfgStrength,
  f5ttsNfeStep,
});
const statusEl = document.getElementById('voice-status');
if (statusEl && xttsVoice) {
statusEl.textContent = `⏳ Stimme "${xttsVoice}" wird geladen...`;
+19
View File
@@ -1423,6 +1423,25 @@ wss.on("connection", (ws) => {
xttsVoice: msg.xttsVoice || "",
};
if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel;
// F5-TTS tuning fields. A field that is absent from the message leaves
// voiceConfig untouched.
//
// String fields: a non-empty value is stored; an empty value removes the key
// so the default applies.
for (const key of ["f5ttsModel", "f5ttsCkptFile", "f5ttsVocabFile"]) {
  if (msg[key] === undefined) continue;
  if (msg[key]) voiceConfig[key] = msg[key];
  else delete voiceConfig[key];
}
// Numeric fields: only finite numbers are stored. The global isNaN() coerces
// its argument, so null, booleans and numeric strings would pass a
// `!isNaN(x)` guard and end up in the persisted config. null matters in
// particular because JSON.stringify turns NaN into null (assuming msg was
// parsed from JSON — confirm against the message handler).
for (const key of ["f5ttsCfgStrength", "f5ttsNfeStep"]) {
  if (typeof msg[key] === "number" && Number.isFinite(msg[key])) {
    voiceConfig[key] = msg[key];
  }
}
try {
fs.mkdirSync("/shared/config", { recursive: true });
fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));