feat: Whisper model selector + 16kHz mono recording
- App: AudioSamplingRateAndroid 16000 + AudioChannelsAndroid 1 → Whisper bekommt direkt sein Ziel-Format, kein Resample mehr - Bridge: STTEngine.reload() laedt Modell zur Laufzeit neu (tiny/base/small/medium/large-v3) - Bridge: Config-Message triggert Hot-Reload wenn whisperModel sich aendert - Bridge: Default auf 'medium' (besser als 'small' bei aehnlicher Latenz) - Diagnostic: Neue Sektion "Whisper (Spracherkennung)" mit Dropdown, auto-save bei Auswahl, beim Laden wird der gespeicherte Wert gesetzt - Diagnostic/Server: send_voice_config merged whisperModel in voice_config.json - aria.env.example: WHISPER_MODEL + WHISPER_LANGUAGE dokumentiert Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+34
-2
@@ -63,7 +63,7 @@ RVS_TLS = os.getenv("RVS_TLS", "true") # true = wss://, false = ws://
|
||||
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true") # Bei TLS-Fehler ws:// versuchen
|
||||
RVS_TOKEN = os.getenv("RVS_TOKEN", "") # Pairing-Token (gleich wie in der App)
|
||||
DIAGNOSTIC_URL = os.getenv("DIAGNOSTIC_URL", "http://127.0.0.1:3001") # Diagnostic API
|
||||
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
|
||||
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "medium")
|
||||
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "de")
|
||||
|
||||
# Audio-Parameter
|
||||
@@ -330,6 +330,25 @@ class STTEngine:
|
||||
self.model = WhisperModel(self.model_size, device="cpu", compute_type="int8")
|
||||
logger.info("Whisper-Modell geladen")
|
||||
|
||||
def reload(self, model_size: str) -> bool:
|
||||
"""Laedt ein anderes Whisper-Modell (bei Config-Aenderung)."""
|
||||
if model_size == self.model_size and self.model is not None:
|
||||
return False
|
||||
allowed = {"tiny", "base", "small", "medium", "large-v3"}
|
||||
if model_size not in allowed:
|
||||
logger.warning("Ungueltiges Whisper-Modell: %s (erlaubt: %s)", model_size, allowed)
|
||||
return False
|
||||
logger.info("Lade Whisper-Modell neu: %s -> %s", self.model_size, model_size)
|
||||
self.model_size = model_size
|
||||
self.model = None
|
||||
try:
|
||||
self.model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
logger.info("Whisper-Modell '%s' geladen", model_size)
|
||||
return True
|
||||
except Exception:
|
||||
logger.exception("Whisper-Modell '%s' konnte nicht geladen werden", model_size)
|
||||
return False
|
||||
|
||||
def transcribe(self, audio_data: np.ndarray) -> str:
|
||||
"""Transkribiert Audio-Daten zu Text.
|
||||
|
||||
@@ -502,6 +521,7 @@ class ARIABridge:
|
||||
# Komponenten
|
||||
self.voice_engine = VoiceEngine(VOICES_DIR)
|
||||
self.tts_enabled = True
|
||||
vc: dict = {}
|
||||
# Gespeicherte Voice-Config laden
|
||||
try:
|
||||
vc_path = "/shared/config/voice_config.json"
|
||||
@@ -520,8 +540,10 @@ class ARIABridge:
|
||||
logger.info("Voice-Config geladen: %s", vc)
|
||||
except Exception as e:
|
||||
logger.warning("Voice-Config laden fehlgeschlagen: %s", e)
|
||||
# Whisper-Modell: Config hat Vorrang, dann env/Default (medium)
|
||||
whisper_model = vc.get("whisperModel") or self.config.get("WHISPER_MODEL", WHISPER_MODEL)
|
||||
self.stt_engine = STTEngine(
|
||||
model_size=self.config.get("WHISPER_MODEL", WHISPER_MODEL),
|
||||
model_size=whisper_model,
|
||||
language=self.config.get("WHISPER_LANGUAGE", WHISPER_LANGUAGE),
|
||||
)
|
||||
self.wake_word = WakeWordDetector()
|
||||
@@ -1163,6 +1185,15 @@ class ARIABridge:
|
||||
self.voice_engine.speech_speed["thorsten"] = max(0.3, min(2.0, float(payload["speedThorsten"])))
|
||||
logger.info("[rvs] Speed Thorsten: %.1f", self.voice_engine.speech_speed["thorsten"])
|
||||
changed = True
|
||||
whisper_reloaded = False
|
||||
if "whisperModel" in payload:
|
||||
new_model = payload["whisperModel"]
|
||||
if new_model and new_model != self.stt_engine.model_size:
|
||||
logger.info("[rvs] Whisper-Modell Wechsel: %s -> %s (laedt...)", self.stt_engine.model_size, new_model)
|
||||
loop = asyncio.get_event_loop()
|
||||
whisper_reloaded = await loop.run_in_executor(None, self.stt_engine.reload, new_model)
|
||||
if whisper_reloaded:
|
||||
changed = True
|
||||
# Persistent speichern in Shared Volume
|
||||
if changed:
|
||||
try:
|
||||
@@ -1175,6 +1206,7 @@ class ARIABridge:
|
||||
"xttsVoice": getattr(self, "xtts_voice", ""),
|
||||
"speedRamona": self.voice_engine.speech_speed.get("ramona", 1.0),
|
||||
"speedThorsten": self.voice_engine.speech_speed.get("thorsten", 1.0),
|
||||
"whisperModel": self.stt_engine.model_size,
|
||||
}
|
||||
with open("/shared/config/voice_config.json", "w") as f:
|
||||
json.dump(config_data, f, indent=2)
|
||||
|
||||
Reference in New Issue
Block a user