Compare commits
6 Commits
v0.0.5.6
...
8b52f4c92b
| Author | SHA1 | Date | |
|---|---|---|---|
| 8b52f4c92b | |||
| dc20570f6d | |||
| 744a27cfd1 | |||
| 37c5f6c368 | |||
| a361015ff4 | |||
| d83b555209 |
+41
-4
@@ -544,6 +544,10 @@ class ARIABridge:
|
||||
# STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten.
|
||||
# requestId → Future mit dem Text (oder None bei Fehler).
|
||||
self._pending_stt: dict[str, asyncio.Future] = {}
|
||||
# whisper-bridge service_status: True wenn ready, False/None wenn loading/unbekannt.
|
||||
# Beeinflusst das Timeout fuer stt_request — bei "loading" warten wir laenger,
|
||||
# weil das Modell beim ersten Request noch ~1-2 Min runtergeladen werden kann.
|
||||
self._remote_stt_ready: bool = False
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Initialisiert alle Komponenten.
|
||||
@@ -1442,13 +1446,41 @@ class ARIABridge:
|
||||
future.set_result(text)
|
||||
return
|
||||
|
||||
elif msg_type == "service_status":
|
||||
# Gamebox-Bridges (whisper / f5tts) melden ihren Lade-Status.
|
||||
# Wir nutzen das fuer den dynamischen STT-Timeout: solange whisper
|
||||
# im 'loading' steckt, geben wir der Bridge mehr Zeit (Modell-Download
|
||||
# kann 1-2 Min dauern), statt nach 45s lokal zu fallbacken.
|
||||
svc = payload.get("service", "")
|
||||
state = payload.get("state", "")
|
||||
if svc == "whisper":
|
||||
was_ready = self._remote_stt_ready
|
||||
self._remote_stt_ready = (state == "ready")
|
||||
if self._remote_stt_ready != was_ready:
|
||||
logger.info("[rvs] whisper-bridge -> %s", state)
|
||||
return
|
||||
|
||||
elif msg_type == "config_request":
|
||||
# Eine andere Bridge (whisper/f5tts) bittet um die aktuelle Voice-
|
||||
# Config — passiert wenn sie sich connected, weil sie sonst die
|
||||
# Diagnostic-Settings nicht kennt. Wir broadcasten die persistierte
|
||||
# Config (auch beim normalen Connect von aria-bridge selber, aber
|
||||
# da war eventuell die andere Bridge noch nicht connected).
|
||||
requester = payload.get("service", "?")
|
||||
logger.info("[rvs] config_request von %s — broadcaste Voice-Config", requester)
|
||||
asyncio.create_task(self._broadcast_persisted_config())
|
||||
return
|
||||
|
||||
else:
|
||||
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
|
||||
|
||||
# STT-Orchestrierung: zuerst Remote (Gamebox), Fallback lokal.
|
||||
# Timeout grosszuegig gewaehlt, damit auch ein erstmaliger Modell-Load
|
||||
# auf der Gamebox (bis ~30s bei large-v3) durchgeht.
|
||||
_STT_REMOTE_TIMEOUT_S = 45.0
|
||||
# Zwei Timeouts:
|
||||
# ready=True → 45s reicht selbst fuer lange Audios
|
||||
# ready=False → 300s, weil das Modell evtl. noch heruntergeladen wird
|
||||
# (large-v3 ~3GB, kann auf der Gamebox 1-2 Min dauern).
|
||||
_STT_REMOTE_TIMEOUT_READY_S = 45.0
|
||||
_STT_REMOTE_TIMEOUT_LOADING_S = 300.0
|
||||
|
||||
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
|
||||
"""App-Audio → STT → aria-core. Primaer via whisper-bridge (RVS), Fallback lokal."""
|
||||
@@ -1514,7 +1546,12 @@ class ARIABridge:
|
||||
if not ok:
|
||||
logger.warning("[rvs] stt_request konnte nicht gesendet werden — skip Remote")
|
||||
return None
|
||||
return await asyncio.wait_for(future, timeout=self._STT_REMOTE_TIMEOUT_S)
|
||||
timeout_s = (self._STT_REMOTE_TIMEOUT_READY_S
|
||||
if self._remote_stt_ready
|
||||
else self._STT_REMOTE_TIMEOUT_LOADING_S)
|
||||
logger.info("[rvs] STT-Timeout %ds (whisper-bridge %s)",
|
||||
int(timeout_s), "ready" if self._remote_stt_ready else "loading")
|
||||
return await asyncio.wait_for(future, timeout=timeout_s)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("[rvs] Remote-STT Timeout (%.0fs)", self._STT_REMOTE_TIMEOUT_S)
|
||||
return None
|
||||
|
||||
+5
-1
@@ -22,6 +22,7 @@ const ALLOWED_TYPES = new Set([
|
||||
"voice_preload", "voice_ready",
|
||||
"stt_request", "stt_response",
|
||||
"service_status",
|
||||
"config_request",
|
||||
]);
|
||||
|
||||
// Token-Raum: token -> { clients: Set<ws> }
|
||||
@@ -54,7 +55,10 @@ function cleanupRooms() {
|
||||
|
||||
// ── WebSocket-Server starten ────────────────────────────────────────
|
||||
|
||||
const wss = new WebSocketServer({ port: PORT });
|
||||
// maxPayload 50MB: TTS-Streaming + Voice-Upload (WAV als base64) +
|
||||
// audio_pcm Chunks koennen die ws-Library Default 1MB ueberschreiten.
|
||||
// Default-Limit war der Killer fuer die voice_upload Pipeline.
|
||||
const wss = new WebSocketServer({ port: PORT, maxPayload: 50 * 1024 * 1024 });
|
||||
|
||||
wss.on("listening", () => {
|
||||
log(`RVS läuft auf Port ${PORT} | Max Sessions: ${MAX_SESSIONS}`);
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# HuggingFace Model-Cache (Whisper + F5-TTS, geteilt zwischen den
|
||||
# beiden Bridges via Bind-Mount, kann mehrere GB werden)
|
||||
hf-cache/
|
||||
|
||||
# Voice-Samples (lokal, gehoert nicht ins Repo)
|
||||
voices/
|
||||
|
||||
|
||||
+10
-7
@@ -31,11 +31,11 @@ services:
|
||||
capabilities: [gpu]
|
||||
volumes:
|
||||
- ./voices:/voices # WAV + TXT Referenz
|
||||
# KEIN HF-Cache-Mount mehr —
|
||||
# Modell wird beim Start neu
|
||||
# gezogen. Diagnostic zeigt
|
||||
# "TTS laedt..." Banner bis
|
||||
# service_status: ready kommt.
|
||||
- ./hf-cache:/root/.cache/huggingface # HF-Cache als Bind-Mount.
|
||||
# Direkt sichtbar im xtts/hf-cache/,
|
||||
# einfach manuell zu loeschen, kein
|
||||
# Docker-Desktop .vhdx Bloat.
|
||||
# Wird mit whisper-bridge geteilt.
|
||||
environment:
|
||||
# Bootstrap-only — alle anderen F5-TTS-Settings (Modell, cfg_strength,
|
||||
# nfe_step, Custom-Checkpoint) kommen ueber Diagnostic via RVS-config.
|
||||
@@ -77,6 +77,9 @@ services:
|
||||
- WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
|
||||
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
||||
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
|
||||
# KEIN HF-Cache-Mount — Whisper-Modell wird beim Start neu gezogen.
|
||||
# Wechsel via Diagnostic triggert ebenso Re-Download.
|
||||
volumes:
|
||||
- ./hf-cache:/root/.cache/huggingface # gleicher Cache wie f5tts-bridge —
|
||||
# ein Modell muss nur einmal pro
|
||||
# Maschine geladen werden, kein
|
||||
# Re-Download bei Container-Restart.
|
||||
restart: unless-stopped
|
||||
|
||||
+66
-26
@@ -73,6 +73,12 @@ VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
|
||||
|
||||
PCM_CHUNK_BYTES = 8192 # ~170ms @ 24kHz mono s16
|
||||
TARGET_SR = 24000 # F5-TTS native
|
||||
# F5-TTS hat ein 12s Hard-Limit fuer Referenz-Audio. Laengere WAVs werden
|
||||
# vom Modell stumm abgeschnitten — aber unser ref_text bleibt lang und passt
|
||||
# dann nicht mehr zum gekuerzten Audio (Quali leidet, warmup-Render ist
|
||||
# unnoetig lange). Wir clippen explizit auf 10s + re-transkribieren den Text
|
||||
# damit beide synchron bleiben.
|
||||
REF_MAX_SECONDS = 10.0
|
||||
|
||||
# Wird in einer Uebergangsphase als "ungueltige Referenz" erkannt (alte voices,
|
||||
# die hochgeladen wurden bevor die whisper-bridge online war). Bei Erkennung
|
||||
@@ -248,32 +254,42 @@ def voice_paths(name: str) -> tuple[Path, Path]:
|
||||
return VOICES_DIR / f"{safe}.wav", VOICES_DIR / f"{safe}.txt"
|
||||
|
||||
|
||||
def ensure_24k_mono_wav(src_wav: Path) -> Path:
|
||||
"""F5-TTS moechte 24kHz mono als Referenz — ffmpeg konvertiert inplace.
|
||||
def normalize_ref_wav(src_wav: Path, max_seconds: float = REF_MAX_SECONDS) -> tuple[Path, bool]:
|
||||
"""Bringt die Referenz-WAV in F5-TTS-freundliche Form:
|
||||
24kHz mono + max max_seconds Dauer. Original wird ueberschrieben wenn
|
||||
Aenderungen noetig waren.
|
||||
|
||||
Wenn das File schon passt, wird nichts geaendert. Sonst wird es
|
||||
reingeschrieben (Original wird ueberschrieben).
|
||||
Returns:
|
||||
(path, was_modified) — was_modified=True wenn die Datei wirklich
|
||||
geaendert wurde (Caller sollte dann den passenden .txt invalidieren).
|
||||
"""
|
||||
try:
|
||||
info = sf.info(str(src_wav))
|
||||
if info.samplerate == TARGET_SR and info.channels == 1:
|
||||
return src_wav
|
||||
# Schon gut? Sample-Rate, Kanaele und Dauer passen?
|
||||
if (info.samplerate == TARGET_SR and info.channels == 1
|
||||
and info.duration <= max_seconds + 0.1):
|
||||
return src_wav, False
|
||||
except Exception:
|
||||
pass
|
||||
info = None
|
||||
|
||||
tmp_out = src_wav.with_suffix(".conv.wav")
|
||||
cmd = ["ffmpeg", "-y", "-i", str(src_wav),
|
||||
"-ar", str(TARGET_SR), "-ac", "1", "-f", "wav", str(tmp_out)]
|
||||
"-ar", str(TARGET_SR), "-ac", "1",
|
||||
"-t", str(max_seconds),
|
||||
"-f", "wav", str(tmp_out)]
|
||||
r = subprocess.run(cmd, capture_output=True, timeout=30)
|
||||
if r.returncode != 0:
|
||||
logger.warning("ffmpeg-Konvertierung von %s fehlgeschlagen: %s",
|
||||
logger.warning("ffmpeg-Normalisierung von %s fehlgeschlagen: %s",
|
||||
src_wav, r.stderr.decode(errors="replace")[:200])
|
||||
try:
|
||||
tmp_out.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
return src_wav
|
||||
return src_wav, False
|
||||
os.replace(tmp_out, src_wav)
|
||||
return src_wav
|
||||
logger.info("Referenz-WAV normalisiert: %s (24kHz mono, max %.1fs)",
|
||||
src_wav.name, max_seconds)
|
||||
return src_wav, True
|
||||
|
||||
|
||||
async def _send(ws, mtype: str, payload: dict) -> None:
|
||||
@@ -349,6 +365,21 @@ async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
|
||||
t0 = time.time()
|
||||
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
|
||||
|
||||
# WAV zu lang? F5-TTS limitiert intern auf 12s, dann passt der txt nicht
|
||||
# mehr zum Audio. Wir clippen explizit auf 10s und invalidieren den txt,
|
||||
# damit er on-the-fly passend zum gekuerzten Audio neu transkribiert wird.
|
||||
if voice and ref_wav_path and ref_wav_path.exists():
|
||||
try:
|
||||
info = sf.info(str(ref_wav_path))
|
||||
if info.duration > REF_MAX_SECONDS + 0.5:
|
||||
logger.info("Voice '%s' WAV ist %.1fs (>%.0fs) → clippen + txt neu",
|
||||
voice, info.duration, REF_MAX_SECONDS)
|
||||
_, modified = normalize_ref_wav(ref_wav_path)
|
||||
if modified and ref_txt_path and ref_txt_path.exists():
|
||||
ref_txt_path.unlink()
|
||||
except Exception as e:
|
||||
logger.warning("Konnte WAV-Dauer nicht pruefen: %s", e)
|
||||
|
||||
# Legacy-Platzhalter erkennen → behandeln als "kein txt" und neu transkribieren
|
||||
if voice and ref_txt_path and ref_txt_path.exists():
|
||||
try:
|
||||
@@ -491,8 +522,9 @@ async def handle_voice_upload(ws, payload: dict) -> None:
|
||||
size_kb = wav_path.stat().st_size / 1024
|
||||
logger.info("Voice WAV gespeichert: %s (%.0fKB)", wav_path, size_kb)
|
||||
|
||||
# Auf 24kHz mono normalisieren (falls App in anderem Format liefert)
|
||||
ensure_24k_mono_wav(wav_path)
|
||||
# Auf 24kHz mono clippen auf 10s (F5-TTS Hard-Limit ist 12s,
|
||||
# kuerzer = schnellerer Warmup + Text+Audio bleiben aligned)
|
||||
normalize_ref_wav(wav_path)
|
||||
|
||||
# Transkription ueber whisper-bridge anfragen
|
||||
logger.info("Transkribiere '%s' via whisper-bridge...", name)
|
||||
@@ -613,22 +645,30 @@ async def run_loop(runner: F5Runner) -> None:
|
||||
tls_fallback_tried = False
|
||||
|
||||
# Status-Broadcast: erst loading, dann ready nach erfolgreichem Load.
|
||||
# Modell wird hier (nicht ausserhalb der Schleife) gestartet damit
|
||||
# der Loading-Status auch wirklich uebertragen werden kann.
|
||||
# Plus: config_request damit wir die persistierte Diagnostic-Config
|
||||
# bekommen, falls aria-bridge ihre nicht von alleine sendet.
|
||||
async def _load_with_status():
|
||||
if runner.model is not None:
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_id,
|
||||
loadSeconds=runner.last_load_seconds)
|
||||
return
|
||||
await _broadcast_status(ws, "loading", model=runner.model_id)
|
||||
try:
|
||||
await runner.ensure_loaded()
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_id,
|
||||
loadSeconds=runner.last_load_seconds)
|
||||
if runner.model is not None:
|
||||
logger.info("Initial: broadcaste ready (Modell schon im RAM: %s)", runner.model_id)
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_id,
|
||||
loadSeconds=runner.last_load_seconds)
|
||||
else:
|
||||
logger.info("Initial: broadcaste loading + lade Modell '%s'", runner.model_id)
|
||||
await _broadcast_status(ws, "loading", model=runner.model_id)
|
||||
await runner.ensure_loaded()
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_id,
|
||||
loadSeconds=runner.last_load_seconds)
|
||||
logger.info("Initial: sende config_request an aria-bridge")
|
||||
await _send(ws, "config_request", {"service": "f5tts"})
|
||||
except Exception as e:
|
||||
await _broadcast_status(ws, "error", error=str(e)[:200])
|
||||
logger.exception("Initial-Load crashed: %s", e)
|
||||
try:
|
||||
await _broadcast_status(ws, "error", error=str(e)[:200])
|
||||
except Exception:
|
||||
pass
|
||||
asyncio.create_task(_load_with_status())
|
||||
|
||||
# TTS-Worker fuer diese Verbindung starten
|
||||
|
||||
+38
-18
@@ -152,8 +152,17 @@ async def handle_stt_request(ws, payload: dict, runner: WhisperRunner) -> None:
|
||||
|
||||
try:
|
||||
t_load = time.time()
|
||||
# Falls Modell noch nicht geladen (Race-Condition: stt_request vor config)
|
||||
# → Status-Broadcast loading→ready damit der App-Banner aufpoppt
|
||||
needs_load = runner.model is None or runner.model_size != model
|
||||
if needs_load:
|
||||
await _broadcast_status(ws, "loading", model=model)
|
||||
await runner.ensure_loaded(model)
|
||||
load_ms = int((time.time() - t_load) * 1000)
|
||||
if needs_load:
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_size,
|
||||
loadSeconds=load_ms / 1000.0)
|
||||
|
||||
audio = ffmpeg_to_float32(audio_b64, mime_type)
|
||||
if audio.size == 0:
|
||||
@@ -203,27 +212,34 @@ async def run_loop(runner: WhisperRunner) -> None:
|
||||
masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
|
||||
try:
|
||||
logger.info("Verbinde zu RVS: %s", masked)
|
||||
async with websockets.connect(url, ping_interval=20, ping_timeout=10) as ws:
|
||||
# max_size 50MB damit grosse stt_request (Voice-Cloning-WAVs als
|
||||
# base64 koennen mehrere MB werden) nicht das Frame-Limit sprengen
|
||||
# und die Verbindung mit 1009 'message too big' killen.
|
||||
async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
|
||||
logger.info("RVS verbunden")
|
||||
retry_s = 2
|
||||
tls_fallback_tried = False
|
||||
|
||||
# Modell laden, dabei loading→ready broadcasten
|
||||
async def _load_with_status():
|
||||
if runner.model is not None:
|
||||
await _broadcast_status(ws, "ready", model=runner.model_size)
|
||||
return
|
||||
await _broadcast_status(ws, "loading", model=WHISPER_MODEL)
|
||||
# Initialer Status-Broadcast — uebertont alten "ready"-State
|
||||
# im App/Diagnostic Banner (sonst denkt der User noch alles ist
|
||||
# gut von vorher). Wenn Modell schon geladen → ready, sonst
|
||||
# loading mit aktuellem (Default-)Namen.
|
||||
# Plus: config_request an aria-bridge — wir wissen nicht ob
|
||||
# sie auch grad reconnected hat oder schon laenger online ist.
|
||||
async def _initial_handshake():
|
||||
try:
|
||||
t0 = time.time()
|
||||
await runner.ensure_loaded(WHISPER_MODEL)
|
||||
elapsed = time.time() - t0
|
||||
await _broadcast_status(ws, "ready",
|
||||
model=runner.model_size,
|
||||
loadSeconds=elapsed)
|
||||
if runner.model is not None:
|
||||
logger.info("Initial: broadcaste ready (Modell schon im RAM: %s)", runner.model_size)
|
||||
await _broadcast_status(ws, "ready", model=runner.model_size)
|
||||
else:
|
||||
init_model = runner.model_size or WHISPER_MODEL
|
||||
logger.info("Initial: broadcaste loading (model=%s)", init_model)
|
||||
await _broadcast_status(ws, "loading", model=init_model)
|
||||
logger.info("Initial: sende config_request an aria-bridge")
|
||||
await _send(ws, "config_request", {"service": "whisper"})
|
||||
except Exception as e:
|
||||
await _broadcast_status(ws, "error", error=str(e)[:200])
|
||||
asyncio.create_task(_load_with_status())
|
||||
logger.exception("Initial-Handshake crashed: %s", e)
|
||||
asyncio.create_task(_initial_handshake())
|
||||
|
||||
async for raw in ws:
|
||||
try:
|
||||
@@ -240,9 +256,13 @@ async def run_loop(runner: WhisperRunner) -> None:
|
||||
req_id[:8] if req_id != "?" else "?", audio_len // 1365)
|
||||
asyncio.create_task(handle_stt_request(ws, payload, runner))
|
||||
elif mtype == "config":
|
||||
new_model = payload.get("whisperModel")
|
||||
if new_model and new_model != runner.model_size:
|
||||
logger.info("Config-Broadcast: Whisper-Modell -> %s", new_model)
|
||||
new_model = payload.get("whisperModel") or WHISPER_MODEL
|
||||
# Laden wenn (a) noch nix geladen, oder (b) Modell wechselt
|
||||
needs_load = (runner.model is None) or (new_model != runner.model_size)
|
||||
if needs_load:
|
||||
logger.info("Config-Broadcast: Whisper-Modell -> %s%s",
|
||||
new_model,
|
||||
" (initial)" if runner.model is None else " (Wechsel)")
|
||||
async def _swap_with_status(target):
|
||||
await _broadcast_status(ws, "loading", model=target)
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user