feat(bridge): stt_endpoint-Handler — Phase 2 Brain-Shortcut
Empfaengt das stt_endpoint-Event der Streaming-Whisper-Bridge und uebernimmt den Pfad den sonst _process_app_audio NACH dem STT-Schritt hat: broadcastet chat(sender=stt) fuer die App-UI-Bubble, baut den Core-Text und ruft send_to_core(). Damit faellt der Audio-Roundtrip App→aria→whisper→aria komplett weg — die App schickt nur noch PCM-Chunks direkt an whisper-bridge, whisper meldet Endpoint, aria forwarded sofort an Brain. Echos voice/speed/interrupted/location aus dem App-Payload werden respektiert wie beim Legacy 'audio'-Event. clean_text_for_tts + ttsText-Embedding bleiben unveraendert da der TTS-Pfad ueber das bestehende send_to_core laeuft. Idempotenz via audioRequestId als client_msg_id — falls die App den Stream durch einen Reconnect-Race nochmal triggern sollte. source-Tag fuer den Brain-Log: "app-voice-stream" statt "app-voice" damit man im Brain-Log sehen kann ob via Legacy- oder Stream-Pfad.
This commit is contained in:
@@ -2520,6 +2520,59 @@ class ARIABridge:
|
|||||||
future.set_result(text)
|
future.set_result(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
elif msg_type == "stt_endpoint":
|
||||||
|
# Phase 2 Brain-Shortcut: die whisper-bridge hat im Streaming-Modus
|
||||||
|
# einen Endpoint erkannt und schickt den finalen Text direkt.
|
||||||
|
# Wir uebernehmen die Rolle die sonst _process_app_audio NACH dem
|
||||||
|
# STT-Schritt hat: STT-Text fuer UI broadcasten + send_to_core.
|
||||||
|
# Kein Audio-Roundtrip mehr — App-Latenz sinkt deutlich.
|
||||||
|
text = (payload.get("text") or "").strip()
|
||||||
|
if not text:
|
||||||
|
logger.info("[rvs] stt_endpoint mit leerem Text — ignoriert (reason=%s)",
|
||||||
|
payload.get("reason", ""))
|
||||||
|
return
|
||||||
|
audio_request_id = payload.get("audioRequestId", "") or ""
|
||||||
|
voice = payload.get("voice", "") or ""
|
||||||
|
speed_raw = payload.get("speed")
|
||||||
|
interrupted = bool(payload.get("interrupted", False))
|
||||||
|
location = payload.get("location") or None
|
||||||
|
|
||||||
|
# Voice-Override fuer Folgenachrichten — gleiche Semantik wie beim
|
||||||
|
# 'audio'-Event. Nur setzen wenn vom App-Stream mitgegeben.
|
||||||
|
if voice:
|
||||||
|
self._next_voice_override = voice or None
|
||||||
|
logger.info("[rvs] Voice fuer Antworten (via stt_endpoint): %s",
|
||||||
|
self._next_voice_override or "(Default)")
|
||||||
|
if speed_raw is not None:
|
||||||
|
try:
|
||||||
|
sp = float(speed_raw)
|
||||||
|
self._next_speed_override = sp if 0.1 <= sp <= 5.0 else None
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
self._next_speed_override = None
|
||||||
|
|
||||||
|
# State-Persist wie bei _process_app_audio
|
||||||
|
self._persist_location(location)
|
||||||
|
self._persist_user_activity()
|
||||||
|
|
||||||
|
logger.info("[rvs] stt_endpoint: '%s' (%dms, reason=%s)%s%s reqId=%s",
|
||||||
|
text[:80],
|
||||||
|
payload.get("sttMs", 0),
|
||||||
|
payload.get("reason", ""),
|
||||||
|
" [BARGE-IN]" if interrupted else "",
|
||||||
|
" [GPS]" if location else "",
|
||||||
|
audio_request_id[:16] if audio_request_id else "?")
|
||||||
|
|
||||||
|
# Idempotenz ueber audioRequestId — falls App den Stream irgendwie
|
||||||
|
# nochmal triggern sollte (Reconnect-Race etc.).
|
||||||
|
client_msg_id = audio_request_id or None
|
||||||
|
if self._is_duplicate_client_msg(client_msg_id):
|
||||||
|
return
|
||||||
|
|
||||||
|
asyncio.create_task(self._process_endpoint_text(
|
||||||
|
text, interrupted, audio_request_id, location,
|
||||||
|
client_msg_id=client_msg_id))
|
||||||
|
return
|
||||||
|
|
||||||
elif msg_type == "oauth_callback":
|
elif msg_type == "oauth_callback":
|
||||||
# RVS hat einen OAuth-Provider-Callback empfangen (z.B. Spotify
|
# RVS hat einen OAuth-Provider-Callback empfangen (z.B. Spotify
|
||||||
# nach User-Authorize) und broadcastet ihn. Wir forwarden an Brain,
|
# nach User-Authorize) und broadcastet ihn. Wir forwarden an Brain,
|
||||||
@@ -2662,6 +2715,44 @@ class ARIABridge:
|
|||||||
else:
|
else:
|
||||||
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
|
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
|
||||||
|
|
||||||
|
async def _process_endpoint_text(self, text: str,
                                 interrupted: bool = False,
                                 audio_request_id: str = "",
                                 location: Optional[dict] = None,
                                 client_msg_id: Optional[str] = None) -> None:
    """Forward an already-transcribed utterance to the UI and to the core.

    Counterpart to the post-STT half of the audio path: the text arrives
    finished, so this method only (1) broadcasts it as a ``chat`` message
    with ``sender="stt"`` via ``_send_to_rvs`` and (2) hands the text built
    by ``_build_core_text`` to ``send_to_core``.

    The broadcast is best-effort: a falsy result or any exception is only
    logged, and the text is still forwarded to the core afterwards.

    Args:
        text: Final transcript to broadcast and forward.
        interrupted: When truthy, the core ``source`` tag gets a
            `` [barge-in]`` suffix; the flag is also passed to
            ``_build_core_text``.
        audio_request_id: Included in the broadcast payload as
            ``audioRequestId`` when non-empty.
        location: Included in the broadcast payload when truthy and passed
            to ``_build_core_text``.
        client_msg_id: Passed through unchanged to ``send_to_core``.
    """
    try:
        # Optional fields are only added when set, so the payload keeps the
        # key order text, sender, audioRequestId, location.
        bubble = {"text": text, "sender": "stt"}
        if audio_request_id:
            bubble["audioRequestId"] = audio_request_id
        if location:
            bubble["location"] = location

        # NOTE(review): this is the event loop's clock in milliseconds, not
        # a wall-clock epoch value — confirm receivers expect that.
        stamp_ms = int(asyncio.get_event_loop().time() * 1000)
        message = {
            "type": "chat",
            "payload": bubble,
            "timestamp": stamp_ms,
        }

        delivered = await self._send_to_rvs(message)
        if not delivered:
            logger.warning("[rvs] STT-Text (endpoint) NICHT broadcastet")
        else:
            logger.info("[rvs] STT-Text (endpoint) broadcastet")
    except Exception as e:
        # Deliberately broad: a failed UI broadcast must not stop the text
        # from reaching the core below.
        logger.warning("[rvs] STT-Text (endpoint) konnte nicht broadcastet werden: %s", e)

    core_text = self._build_core_text(text, interrupted, location)

    # Distinct source tag so this path can be told apart in downstream logs.
    origin = "app-voice-stream"
    if interrupted:
        origin = origin + " [barge-in]"

    await self.send_to_core(core_text,
                            source=origin,
                            client_msg_id=client_msg_id)
|
||||||
|
|
||||||
async def _stt_remote(self, audio_b64: str, mime_type: str) -> Optional[str]:
|
async def _stt_remote(self, audio_b64: str, mime_type: str) -> Optional[str]:
|
||||||
"""Schickt Audio an die whisper-bridge und wartet auf stt_response.
|
"""Schickt Audio an die whisper-bridge und wartet auf stt_response.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user