diff --git a/xtts/whisper/bridge.py b/xtts/whisper/bridge.py index d1be158..c96843e 100644 --- a/xtts/whisper/bridge.py +++ b/xtts/whisper/bridge.py @@ -63,6 +63,7 @@ ALLOWED_MODELS = {"tiny", "base", "small", "medium", "large-v3"} # Streaming-Parameter (Defaults — koennen pro Session vom App-Payload ueberschrieben werden) STREAM_TRANSCRIBE_INTERVAL_MS = 700 # alle 700ms transkribieren waehrend Stream laeuft +STREAM_SPEAKER_CHECK_MS = 1500 # Mindest-Audio fuer Speaker-ID-Pruefung STREAM_DEFAULT_ENDPOINT_MS = 1500 # nach 1.5s ohne neuen Text → Endpoint STREAM_DEFAULT_HARD_CAP_MS = 60000 # nach 60s Audio: harter Cut egal was STREAM_MIN_AUDIO_MS = 600 # erst transkribieren wenn min 600ms Audio da @@ -311,6 +312,12 @@ class StreamSession: last_transcribe_at: float = 0.0 closed: bool = False # nach stream_end gesetzt endpoint_sent: bool = False # Endpoint nur einmal feuern + # Speaker-ID Gating: bei aktiviertem Fingerprint pruefen wir die ersten + # ~1.5s der Aufnahme. Bei mismatch wird die Session sofort beendet mit + # synthetischem stt_endpoint(text='', reason='speaker_mismatch'). + speaker_checked: bool = False + speaker_match: Optional[bool] = None + speaker_similarity: float = 0.0 class SessionManager: @@ -422,6 +429,77 @@ class SessionManager: sid[:8], now - sess.last_chunk_at) self.drop(sid) + async def _check_speaker(self, sess: StreamSession, ws) -> None: + """Speaker-ID einmalig pro Session: nimmt die ersten ~1.5s Audio, + rechnet das Embedding, vergleicht mit dem persistierten Fingerprint. + Ohne Fingerprint → fail-open (match=True). Bei mismatch wird die + Session sofort beendet mit synthetischem stt_endpoint.""" + sess.speaker_checked = True + # Erste ~1.5s aus dem Buffer entnehmen (16kHz * 2 byte/sample = 32 bytes/ms) + head_bytes = bytes(sess.pcm_buffer[: STREAM_SPEAKER_CHECK_MS * 32]) + if len(head_bytes) < speaker_id.MIN_SAMPLE_BYTES: + # Zu wenig — durchlassen + sess.speaker_match = True + sess.speaker_similarity = 0.0 + return + try: + loop = asyncio.get_running_loop() + is_match, sim = await loop.run_in_executor( + None, speaker_id.verify, head_bytes, + ) + except Exception as exc: + logger.warning("Stream %s: speaker-check crashed (%s) — fail-open", + sess.request_id[:8], exc) + sess.speaker_match = True + sess.speaker_similarity = 0.0 + return + sess.speaker_match = is_match + sess.speaker_similarity = sim + logger.info("Stream %s: speaker-check sim=%.2f → %s (threshold=%.2f)", + sess.request_id[:8], sim, "MATCH" if is_match else "REJECT", + speaker_id.DEFAULT_THRESHOLD) + await _debug_log(ws, "speaker.check", + f"id={sess.request_id[:12]} sim={sim:.2f} " + f"thr={speaker_id.DEFAULT_THRESHOLD:.2f} " + f"{'MATCH' if is_match else 'REJECT'}") + if not is_match: + await self._finalize_speaker_mismatch(sess, ws, sim) + + async def _finalize_speaker_mismatch(self, sess: StreamSession, ws, + similarity: float) -> None: + """Bei Speaker-Mismatch: synthetisches stt_endpoint (text='', reason= + 'speaker_mismatch') schicken damit der App-Pfad sauber endet + (endConversation), Session droppen. Kein Whisper-Transcribe. + Spart die Token + die STT-Latenz fuer fremde Stimmen.""" + if sess.endpoint_sent: + return + sess.endpoint_sent = True + duration_s = self._buffer_duration_ms(sess) / 1000.0 + logger.info("Stream %s: speaker-mismatch (sim=%.2f) — DROP nach %.1fs", + sess.request_id[:8], similarity, duration_s) + endpoint_payload = { + "requestId": sess.request_id, + "audioRequestId": sess.audio_request_id, + "text": "", + "reason": "speaker_mismatch", + "durationS": duration_s, + "sttMs": 0, + "voice": sess.voice, + "speed": sess.speed, + "interrupted": sess.interrupted, + "speakerSimilarity": float(similarity), + } + if sess.location: + endpoint_payload["location"] = sess.location + await _send(ws, "stt_endpoint", endpoint_payload) + await _send(ws, "stt_stream_done", { + "requestId": sess.request_id, + "audioRequestId": sess.audio_request_id, + "text": "", + "reason": "speaker_mismatch", + }) + self.drop(sess.request_id) + async def _tick_session(self, sess: StreamSession, now: float) -> None: ws = self._ws if ws is None: @@ -442,6 +520,15 @@ class SessionManager: await self._finalize(sess, ws, reason="stream_end") return + # Speaker-ID Gating: sobald genug Audio da ist, einmalig pruefen ob's + # Stefan ist. Bei Mismatch → synthetisches Endpoint, Session zu. + # Wenn kein Fingerprint persistiert ist, returnt verify() fail-open + # mit (True, 0.0) — keine Auswirkung. + if not sess.speaker_checked and audio_ms >= STREAM_SPEAKER_CHECK_MS: + await self._check_speaker(sess, ws) + if sess.speaker_match is False: + return # Session bereits beendet via _finalize_speaker_mismatch + # Noch zu wenig Audio fuer eine erste Transkription if audio_ms < STREAM_MIN_AUDIO_MS: return diff --git a/xtts/whisper/speaker_id.py b/xtts/whisper/speaker_id.py index 4799599..7ed82f2 100644 --- a/xtts/whisper/speaker_id.py +++ b/xtts/whisper/speaker_id.py @@ -157,12 +157,18 @@ def delete_fingerprint() -> bool: return False -def verify(audio_bytes: bytes, threshold: float = DEFAULT_THRESHOLD) -> tuple[bool, float]: +def verify(audio_bytes: bytes, threshold: Optional[float] = None) -> tuple[bool, float]: """Returns (is_match, similarity). + Wenn threshold=None: nutzt den Modul-Default (DEFAULT_THRESHOLD) — der wird + vom config-Broadcast zur Laufzeit auf den Diagnostic-Slider-Wert gesetzt. + Default-Arg-Bindung waere zur Def-Zeit, also bewusst None statt direkt. + Fail-open: wenn kein Fingerprint vorhanden ist oder das Embedding-Modell crasht, returnt (True, 0.0) — kein Filtering. Sonst wuerde ein kaputter Speaker-ID-Service die ganze Aufnahme blockieren.""" + if threshold is None: + threshold = DEFAULT_THRESHOLD fp = load_fingerprint() if fp is None: return True, 0.0