diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 41baf6f..2972c20 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -2520,6 +2520,59 @@ class ARIABridge: future.set_result(text) return + elif msg_type == "stt_endpoint": + # Phase 2 Brain-Shortcut: die whisper-bridge hat im Streaming-Modus + # einen Endpoint erkannt und schickt den finalen Text direkt. + # Wir uebernehmen die Rolle die sonst _process_app_audio NACH dem + # STT-Schritt hat: STT-Text fuer UI broadcasten + send_to_core. + # Kein Audio-Roundtrip mehr — App-Latenz sinkt deutlich. + text = (payload.get("text") or "").strip() + if not text: + logger.info("[rvs] stt_endpoint mit leerem Text — ignoriert (reason=%s)", + payload.get("reason", "")) + return + audio_request_id = payload.get("audioRequestId", "") or "" + voice = payload.get("voice", "") or "" + speed_raw = payload.get("speed") + interrupted = bool(payload.get("interrupted", False)) + location = payload.get("location") or None + + # Voice-Override fuer Folgenachrichten — gleiche Semantik wie beim + # 'audio'-Event. Nur setzen wenn vom App-Stream mitgegeben. + if voice: + self._next_voice_override = voice or None + logger.info("[rvs] Voice fuer Antworten (via stt_endpoint): %s", + self._next_voice_override or "(Default)") + if speed_raw is not None: + try: + sp = float(speed_raw) + self._next_speed_override = sp if 0.1 <= sp <= 5.0 else None + except (TypeError, ValueError): + self._next_speed_override = None + + # State-Persist wie bei _process_app_audio + self._persist_location(location) + self._persist_user_activity() + + logger.info("[rvs] stt_endpoint: '%s' (%dms, reason=%s)%s%s reqId=%s", + text[:80], + payload.get("sttMs", 0), + payload.get("reason", ""), + " [BARGE-IN]" if interrupted else "", + " [GPS]" if location else "", + audio_request_id[:16] if audio_request_id else "?") + + # Idempotenz ueber audioRequestId — falls App den Stream irgendwie + # nochmal triggern sollte (Reconnect-Race etc.). + client_msg_id = audio_request_id or None + if self._is_duplicate_client_msg(client_msg_id): + return + + asyncio.create_task(self._process_endpoint_text( + text, interrupted, audio_request_id, location, + client_msg_id=client_msg_id)) + return + elif msg_type == "oauth_callback": # RVS hat einen OAuth-Provider-Callback empfangen (z.B. Spotify # nach User-Authorize) und broadcastet ihn. Wir forwarden an Brain, @@ -2662,6 +2715,44 @@ class ARIABridge: else: logger.info("[rvs] Keine Sprache erkannt — ignoriert") + async def _process_endpoint_text(self, text: str, + interrupted: bool = False, + audio_request_id: str = "", + location: Optional[dict] = None, + client_msg_id: Optional[str] = None) -> None: + """Phase-2 Brain-Shortcut: Streaming-Whisper hat den finalen Text + schon ermittelt — wir uebernehmen den Pfad ab broadcast-STT + brain. + + Spiegel-Methode zu _process_app_audio NACH dem STT-Schritt. Bewusst + eigene Methode statt Code-Pfade in _process_app_audio aufdroeseln, + damit der Legacy-Pfad (App schickt 'audio') unangetastet bleibt. + """ + try: + stt_payload = { + "text": text, + "sender": "stt", + } + if audio_request_id: + stt_payload["audioRequestId"] = audio_request_id + if location: + stt_payload["location"] = location + ok = await self._send_to_rvs({ + "type": "chat", + "payload": stt_payload, + "timestamp": int(asyncio.get_event_loop().time() * 1000), + }) + if ok: + logger.info("[rvs] STT-Text (endpoint) broadcastet") + else: + logger.warning("[rvs] STT-Text (endpoint) NICHT broadcastet") + except Exception as e: + logger.warning("[rvs] STT-Text (endpoint) konnte nicht broadcastet werden: %s", e) + + core_text = self._build_core_text(text, interrupted, location) + await self.send_to_core(core_text, + source="app-voice-stream" + (" [barge-in]" if interrupted else ""), + client_msg_id=client_msg_id) + async def _stt_remote(self, audio_b64: str, mime_type: str) -> Optional[str]: """Schickt Audio an die whisper-bridge und wartet auf stt_response.