feat(bridge): stt_endpoint-Handler — Phase 2 Brain-Shortcut
Empfaengt das stt_endpoint-Event der Streaming-Whisper-Bridge und uebernimmt den Pfad den sonst _process_app_audio NACH dem STT-Schritt hat: broadcastet chat(sender=stt) fuer die App-UI-Bubble, baut den Core-Text und ruft send_to_core(). Damit faellt der Audio-Roundtrip App→aria→whisper→aria komplett weg — die App schickt nur noch PCM-Chunks direkt an whisper-bridge, whisper meldet Endpoint, aria forwarded sofort an Brain. Echos voice/speed/interrupted/location aus dem App-Payload werden respektiert wie beim Legacy 'audio'-Event. clean_text_for_tts + ttsText-Embedding bleiben unveraendert da der TTS-Pfad ueber das bestehende send_to_core laeuft. Idempotenz via audioRequestId als client_msg_id — falls die App den Stream durch einen Reconnect-Race nochmal triggern sollte. source-Tag fuer den Brain-Log: "app-voice-stream" statt "app-voice" damit man im Brain-Log sehen kann ob via Legacy- oder Stream-Pfad.
This commit is contained in:
@@ -2520,6 +2520,59 @@ class ARIABridge:
|
||||
future.set_result(text)
|
||||
return
|
||||
|
||||
elif msg_type == "stt_endpoint":
|
||||
# Phase 2 Brain-Shortcut: die whisper-bridge hat im Streaming-Modus
|
||||
# einen Endpoint erkannt und schickt den finalen Text direkt.
|
||||
# Wir uebernehmen die Rolle die sonst _process_app_audio NACH dem
|
||||
# STT-Schritt hat: STT-Text fuer UI broadcasten + send_to_core.
|
||||
# Kein Audio-Roundtrip mehr — App-Latenz sinkt deutlich.
|
||||
text = (payload.get("text") or "").strip()
|
||||
if not text:
|
||||
logger.info("[rvs] stt_endpoint mit leerem Text — ignoriert (reason=%s)",
|
||||
payload.get("reason", ""))
|
||||
return
|
||||
audio_request_id = payload.get("audioRequestId", "") or ""
|
||||
voice = payload.get("voice", "") or ""
|
||||
speed_raw = payload.get("speed")
|
||||
interrupted = bool(payload.get("interrupted", False))
|
||||
location = payload.get("location") or None
|
||||
|
||||
# Voice-Override fuer Folgenachrichten — gleiche Semantik wie beim
|
||||
# 'audio'-Event. Nur setzen wenn vom App-Stream mitgegeben.
|
||||
if voice:
|
||||
self._next_voice_override = voice or None
|
||||
logger.info("[rvs] Voice fuer Antworten (via stt_endpoint): %s",
|
||||
self._next_voice_override or "(Default)")
|
||||
if speed_raw is not None:
|
||||
try:
|
||||
sp = float(speed_raw)
|
||||
self._next_speed_override = sp if 0.1 <= sp <= 5.0 else None
|
||||
except (TypeError, ValueError):
|
||||
self._next_speed_override = None
|
||||
|
||||
# State-Persist wie bei _process_app_audio
|
||||
self._persist_location(location)
|
||||
self._persist_user_activity()
|
||||
|
||||
logger.info("[rvs] stt_endpoint: '%s' (%dms, reason=%s)%s%s reqId=%s",
|
||||
text[:80],
|
||||
payload.get("sttMs", 0),
|
||||
payload.get("reason", ""),
|
||||
" [BARGE-IN]" if interrupted else "",
|
||||
" [GPS]" if location else "",
|
||||
audio_request_id[:16] if audio_request_id else "?")
|
||||
|
||||
# Idempotenz ueber audioRequestId — falls App den Stream irgendwie
|
||||
# nochmal triggern sollte (Reconnect-Race etc.).
|
||||
client_msg_id = audio_request_id or None
|
||||
if self._is_duplicate_client_msg(client_msg_id):
|
||||
return
|
||||
|
||||
asyncio.create_task(self._process_endpoint_text(
|
||||
text, interrupted, audio_request_id, location,
|
||||
client_msg_id=client_msg_id))
|
||||
return
|
||||
|
||||
elif msg_type == "oauth_callback":
|
||||
# RVS hat einen OAuth-Provider-Callback empfangen (z.B. Spotify
|
||||
# nach User-Authorize) und broadcastet ihn. Wir forwarden an Brain,
|
||||
@@ -2662,6 +2715,44 @@ class ARIABridge:
|
||||
else:
|
||||
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
|
||||
|
||||
async def _process_endpoint_text(self, text: str,
|
||||
interrupted: bool = False,
|
||||
audio_request_id: str = "",
|
||||
location: Optional[dict] = None,
|
||||
client_msg_id: Optional[str] = None) -> None:
|
||||
"""Phase-2 Brain-Shortcut: Streaming-Whisper hat den finalen Text
|
||||
schon ermittelt — wir uebernehmen den Pfad ab broadcast-STT + brain.
|
||||
|
||||
Spiegel-Methode zu _process_app_audio NACH dem STT-Schritt. Bewusst
|
||||
eigene Methode statt Code-Pfade in _process_app_audio aufdroeseln,
|
||||
damit der Legacy-Pfad (App schickt 'audio') unangetastet bleibt.
|
||||
"""
|
||||
try:
|
||||
stt_payload = {
|
||||
"text": text,
|
||||
"sender": "stt",
|
||||
}
|
||||
if audio_request_id:
|
||||
stt_payload["audioRequestId"] = audio_request_id
|
||||
if location:
|
||||
stt_payload["location"] = location
|
||||
ok = await self._send_to_rvs({
|
||||
"type": "chat",
|
||||
"payload": stt_payload,
|
||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||
})
|
||||
if ok:
|
||||
logger.info("[rvs] STT-Text (endpoint) broadcastet")
|
||||
else:
|
||||
logger.warning("[rvs] STT-Text (endpoint) NICHT broadcastet")
|
||||
except Exception as e:
|
||||
logger.warning("[rvs] STT-Text (endpoint) konnte nicht broadcastet werden: %s", e)
|
||||
|
||||
core_text = self._build_core_text(text, interrupted, location)
|
||||
await self.send_to_core(core_text,
|
||||
source="app-voice-stream" + (" [barge-in]" if interrupted else ""),
|
||||
client_msg_id=client_msg_id)
|
||||
|
||||
async def _stt_remote(self, audio_b64: str, mime_type: str) -> Optional[str]:
|
||||
"""Schickt Audio an die whisper-bridge und wartet auf stt_response.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user