From a65ed579d25cce2e631fcd840995aecf61375172 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Sat, 18 Apr 2026 11:37:27 +0200 Subject: [PATCH] feat: Whisper model selector + 16kHz mono recording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - App: AudioSamplingRateAndroid 16000 + AudioChannelsAndroid 1 → Whisper bekommt direkt sein Ziel-Format, kein Resample mehr - Bridge: STTEngine.reload() laedt Modell zur Laufzeit neu (tiny/base/small/medium/large-v3) - Bridge: Config-Message triggert Hot-Reload wenn whisperModel sich aendert - Bridge: Default auf 'medium' (besser als 'small' bei aehnlicher Latenz) - Diagnostic: Neue Sektion "Whisper (Spracherkennung)" mit Dropdown, auto-save bei Auswahl, beim Laden wird der gespeicherte Wert gesetzt - Diagnostic/Server: send_voice_config merged whisperModel in voice_config.json - aria.env.example: WHISPER_MODEL + WHISPER_LANGUAGE dokumentiert Co-Authored-By: Claude Opus 4.7 (1M context) --- android/src/services/audio.ts | 2 ++ aria-data/config/aria.env.example | 7 ++++++ bridge/aria_bridge.py | 36 +++++++++++++++++++++++++++++-- diagnostic/index.html | 32 ++++++++++++++++++++++++++- diagnostic/server.js | 7 +++++- issue.md | 2 ++ 6 files changed, 82 insertions(+), 4 deletions(-) diff --git a/android/src/services/audio.ts b/android/src/services/audio.ts index 8227cbc..c17b6ae 100644 --- a/android/src/services/audio.ts +++ b/android/src/services/audio.ts @@ -127,6 +127,8 @@ class AudioService { AudioEncoderAndroid: AudioEncoderAndroidType.AAC, AudioSourceAndroid: AudioSourceAndroidType.MIC, OutputFormatAndroid: OutputFormatAndroidType.MPEG_4, + AudioSamplingRateAndroid: 16000, + AudioChannelsAndroid: 1, }, true); // meteringEnabled = true // Metering-Callback diff --git a/aria-data/config/aria.env.example b/aria-data/config/aria.env.example index eccfa90..0937d16 100644 --- a/aria-data/config/aria.env.example +++ b/aria-data/config/aria.env.example @@ -9,3 +9,10 @@ PIPER_THORSTEN=/voices/de_DE-thorsten-high.onnx # Wake-Word WAKE_WORD=aria + +# Whisper STT — wird zur Laufzeit in der Diagnostic (Sektion "Whisper") umgeschaltet +# und in /shared/config/voice_config.json gespeichert. Der Wert hier ist nur der +# Initial-Default beim ersten Start. +# Optionen: tiny | base | small | medium | large-v3 +WHISPER_MODEL=medium +WHISPER_LANGUAGE=de diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 6a239fd..8100b8b 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -63,7 +63,7 @@ RVS_TLS = os.getenv("RVS_TLS", "true") # true = wss://, false = ws:// RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true") # Bei TLS-Fehler ws:// versuchen RVS_TOKEN = os.getenv("RVS_TOKEN", "") # Pairing-Token (gleich wie in der App) DIAGNOSTIC_URL = os.getenv("DIAGNOSTIC_URL", "http://127.0.0.1:3001") # Diagnostic API -WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small") +WHISPER_MODEL = os.getenv("WHISPER_MODEL", "medium") WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "de") # Audio-Parameter @@ -330,6 +330,25 @@ class STTEngine: self.model = WhisperModel(self.model_size, device="cpu", compute_type="int8") logger.info("Whisper-Modell geladen") + def reload(self, model_size: str) -> bool: + """Laedt ein anderes Whisper-Modell (bei Config-Aenderung).""" + if model_size == self.model_size and self.model is not None: + return False + allowed = {"tiny", "base", "small", "medium", "large-v3"} + if model_size not in allowed: + logger.warning("Ungueltiges Whisper-Modell: %s (erlaubt: %s)", model_size, allowed) + return False + logger.info("Lade Whisper-Modell neu: %s -> %s", self.model_size, model_size) + self.model_size = model_size + self.model = None + try: + self.model = WhisperModel(model_size, device="cpu", compute_type="int8") + logger.info("Whisper-Modell '%s' geladen", model_size) + return True + except Exception: + logger.exception("Whisper-Modell '%s' konnte nicht geladen werden", model_size) + return False + def transcribe(self, audio_data: np.ndarray) -> str: """Transkribiert Audio-Daten zu Text. @@ -502,6 +521,7 @@ class ARIABridge: # Komponenten self.voice_engine = VoiceEngine(VOICES_DIR) self.tts_enabled = True + vc: dict = {} # Gespeicherte Voice-Config laden try: vc_path = "/shared/config/voice_config.json" @@ -520,8 +540,10 @@ class ARIABridge: logger.info("Voice-Config geladen: %s", vc) except Exception as e: logger.warning("Voice-Config laden fehlgeschlagen: %s", e) + # Whisper-Modell: Config hat Vorrang, dann env/Default (medium) + whisper_model = vc.get("whisperModel") or self.config.get("WHISPER_MODEL", WHISPER_MODEL) self.stt_engine = STTEngine( - model_size=self.config.get("WHISPER_MODEL", WHISPER_MODEL), + model_size=whisper_model, language=self.config.get("WHISPER_LANGUAGE", WHISPER_LANGUAGE), ) self.wake_word = WakeWordDetector() @@ -1163,6 +1185,15 @@ class ARIABridge: self.voice_engine.speech_speed["thorsten"] = max(0.3, min(2.0, float(payload["speedThorsten"]))) logger.info("[rvs] Speed Thorsten: %.1f", self.voice_engine.speech_speed["thorsten"]) changed = True + whisper_reloaded = False + if "whisperModel" in payload: + new_model = payload["whisperModel"] + if new_model and new_model != self.stt_engine.model_size: + logger.info("[rvs] Whisper-Modell Wechsel: %s -> %s (laedt...)", self.stt_engine.model_size, new_model) + loop = asyncio.get_event_loop() + whisper_reloaded = await loop.run_in_executor(None, self.stt_engine.reload, new_model) + if whisper_reloaded: + changed = True # Persistent speichern in Shared Volume if changed: try: @@ -1175,6 +1206,7 @@ class ARIABridge: "xttsVoice": getattr(self, "xtts_voice", ""), "speedRamona": self.voice_engine.speech_speed.get("ramona", 1.0), "speedThorsten": self.voice_engine.speech_speed.get("thorsten", 1.0), + "whisperModel": self.stt_engine.model_size, } with open("/shared/config/voice_config.json", "w") as f: json.dump(config_data, f, indent=2) diff --git a/diagnostic/index.html b/diagnostic/index.html index 0f5c3d4..4438296 100644 --- a/diagnostic/index.html +++ b/diagnostic/index.html @@ -499,6 +499,30 @@ + +
+

Whisper (Spracherkennung)

+
+ Aenderungen werden sofort an die Bridge gesendet und das Modell neu geladen + (kann bei medium/large 10-30s dauern — waehrend dieser Zeit ist STT kurz pausiert). +
+
+
+ + +
+
+ Tipp: medium ist der beste Kompromiss fuer CPU. large-v3 nur bei GPU sinnvoll. +
+
+
+

Highlight-Trigger

@@ -763,6 +787,11 @@ } xttsSelect.value = xttsVoice; toggleXTTSPanel(); + // Whisper-Modell wiederherstellen (falls gesetzt) + if (msg.whisperModel) { + const wSel = document.getElementById('diag-whisper-model'); + if (wSel) wSel.value = msg.whisperModel; + } return; } @@ -1404,7 +1433,8 @@ const speedThorsten = parseFloat(document.getElementById('diag-speed-thorsten').value); const ttsEngine = document.getElementById('diag-tts-engine').value; const xttsVoice = document.getElementById('diag-xtts-voice').value; - send({ action: 'send_voice_config', defaultVoice, highlightVoice, ttsEnabled, speedRamona, speedThorsten, ttsEngine, xttsVoice }); + const whisperModel = document.getElementById('diag-whisper-model').value; + send({ action: 'send_voice_config', defaultVoice, highlightVoice, ttsEnabled, speedRamona, speedThorsten, ttsEngine, xttsVoice, whisperModel }); } // ── Highlight-Trigger ──────────────────────── diff --git a/diagnostic/server.js b/diagnostic/server.js index 8067fef..b35628f 100644 --- a/diagnostic/server.js +++ b/diagnostic/server.js @@ -1253,7 +1253,11 @@ wss.on("connection", (ws) => { handleGetVoiceConfig(ws); } else if (msg.action === "send_voice_config") { // Stimmen-Config persistent speichern + an Bridge via RVS senden + // Bestehende Config lesen um Felder zu mergen die dieser Call nicht setzt + let existing = {}; + try { existing = JSON.parse(fs.readFileSync("/shared/config/voice_config.json", "utf-8")); } catch {} const voiceConfig = { + ...existing, defaultVoice: msg.defaultVoice || "ramona", highlightVoice: msg.highlightVoice || "thorsten", ttsEnabled: msg.ttsEnabled !== false, @@ -1262,12 +1266,13 @@ wss.on("connection", (ws) => { speedRamona: msg.speedRamona || 1.0, speedThorsten: msg.speedThorsten || 1.0, }; + if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel; try { fs.mkdirSync("/shared/config", { recursive: true }); fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2)); } catch {} sendToRVS_raw({ type: "config", payload: voiceConfig, timestamp: Date.now() }); - log("info", "server", `Voice-Config gespeichert+gesendet: default=${voiceConfig.defaultVoice}, highlight=${voiceConfig.highlightVoice}, tts=${voiceConfig.ttsEnabled}`); + log("info", "server", `Voice-Config gespeichert+gesendet: default=${voiceConfig.defaultVoice}, whisper=${voiceConfig.whisperModel || "-"}`); } else if (msg.action === "get_triggers") { handleGetTriggers(ws); } else if (msg.action === "save_triggers") { diff --git a/issue.md b/issue.md index 3a13483..4cad271 100644 --- a/issue.md +++ b/issue.md @@ -35,6 +35,8 @@ - [x] Session-Persistenz: Gewaehlte Session bleibt ueber Container-Restarts erhalten (sessionFromFile-Flag, atomic write) - [x] Diagnostic: "ARIA denkt..." bleibt nicht mehr stehen (pipelineEnd broadcastet immer idle, auch bei Timeout/Fehler/Disconnect) - [x] App: "ARIA denkt..." Indicator + Abbrechen-Button (Bridge spiegelt agent_activity via RVS) +- [x] Whisper STT: Model-Auswahl in Diagnostic (tiny/base/small/medium/large-v3), Hot-Reload in Bridge, Default auf medium +- [x] App: Audio-Aufnahme explizit 16kHz mono (spart Resample, optimal fuer Whisper) ## Offen