diff --git a/rvs/server.js b/rvs/server.js
index 2c26d5f..a94b762 100644
--- a/rvs/server.js
+++ b/rvs/server.js
@@ -42,6 +42,12 @@ const ALLOWED_TYPES = new Set([
   // die feuert stt_endpoint mit dem finalen Text — kein Audio-Roundtrip.
   "stt_stream_start", "stt_audio_chunk", "stt_stream_end",
   "stt_partial", "stt_endpoint", "stt_stream_done",
+  // Speaker-ID / voice enrollment (phase 1+2): the app sends 5-10 samples to
+  // the whisper-bridge, which computes a voice fingerprint (embedding vector)
+  // and uses it to let only Stefan's voice through to Whisper STT.
+  "voice_id_status_request", "voice_id_status_response",
+  "voice_id_enroll_request", "voice_id_enroll_response",
+  "voice_id_delete_request", "voice_id_delete_response",
   // File-Versioning (Datei-Manager in App): Versionen pro Datei listen,
   // alte Versionen herunterladen, Restore = non-destructive neuer Commit.
   "file_version_list_request", "file_version_list_response",
diff --git a/xtts/docker-compose.yml b/xtts/docker-compose.yml
index ea5ab5f..501ff60 100644
--- a/xtts/docker-compose.yml
+++ b/xtts/docker-compose.yml
@@ -85,4 +85,7 @@ services:
                                         # ein Modell muss nur einmal pro
                                         # Maschine geladen werden, kein
                                         # Re-Download bei Container-Restart.
+      - ./voice-id:/voice-id            # Speaker-ID fingerprint (Stefan's
+                                        # voice embedding), persisted across
+                                        # container restarts.
     restart: unless-stopped
diff --git a/xtts/whisper/Dockerfile b/xtts/whisper/Dockerfile
index 7a55c56..da0ad6b 100644
--- a/xtts/whisper/Dockerfile
+++ b/xtts/whisper/Dockerfile
@@ -1,14 +1,22 @@
 FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
 
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3 python3-pip ffmpeg \
+    python3 python3-pip ffmpeg git \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
+# Install the PyTorch CUDA wheels first (otherwise speechbrain pulls in a
+# CPU-only torch if f5tts has not seeded the cache yet).
+RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
+    --index-url https://download.pytorch.org/whl/cu121
+
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
 
-COPY bridge.py .
+COPY bridge.py speaker_id.py ./
 
 CMD ["python3", "bridge.py"]
diff --git a/xtts/whisper/bridge.py b/xtts/whisper/bridge.py
index 7ec3e73..e79e281 100644
--- a/xtts/whisper/bridge.py
+++ b/xtts/whisper/bridge.py
@@ -33,6 +33,8 @@ import sys
 import tempfile
 import time
 from dataclasses import dataclass, field
+
+import speaker_id
 from typing import Optional
 
 import numpy as np
@@ -729,6 +731,59 @@ async def run_loop(runner: WhisperRunner, sessions: SessionManager) -> None:
                                 f"received id={req_id[:12]} reason={payload.get('reason', '')}")
                     sessions.end_session(req_id)
 
+                elif mtype == "voice_id_status_request":
+                    req_id = payload.get("requestId", "")
+                    try:
+                        status = speaker_id.status()
+                    except Exception as exc:
+                        await _send(ws, "voice_id_status_response", {
+                            "requestId": req_id, "ok": False, "error": str(exc)[:200],
+                        })
+                        continue
+                    await _send(ws, "voice_id_status_response", {
+                        "requestId": req_id, "ok": True, **status,
+                    })
+
+                elif mtype == "voice_id_enroll_request":
+                    # samples: list of base64-encoded int16 LE PCM buffers,
+                    # 16kHz mono, ~3-5s each. The app records them one after
+                    # another and sends them together.
+                    req_id = payload.get("requestId", "")
+                    samples = payload.get("samples") or []
+                    logger.info("voice_id_enroll_request: %d Samples (id=%s)",
+                                len(samples), str(req_id)[:8])
+                    try:
+                        result = await asyncio.get_running_loop().run_in_executor(
+                            None, speaker_id.enroll_from_samples, samples
+                        )
+                    except Exception as exc:
+                        logger.warning("voice_id_enroll failed: %s", exc)
+                        await _send(ws, "voice_id_enroll_response", {
+                            "requestId": req_id, "ok": False, "error": str(exc)[:300],
+                        })
+                        continue
+                    await _send(ws, "voice_id_enroll_response", {
+                        "requestId": req_id, "ok": True,
+                        "sample_count": result.get("sample_count", 0),
+                        "rejected": result.get("rejected", []),
+                        "updated_at": result.get("updated_at"),
+                        "embedding_dim": result.get("embedding_dim"),
+                    })
+
+                elif mtype == "voice_id_delete_request":
+                    req_id = payload.get("requestId", "")
+                    try:
+                        removed = speaker_id.delete_fingerprint()
+                    except Exception as exc:
+                        logger.warning("voice_id_delete failed: %s", exc)
+                        await _send(ws, "voice_id_delete_response", {
+                            "requestId": req_id, "ok": False, "error": str(exc)[:200],
+                        })
+                        continue
+                    await _send(ws, "voice_id_delete_response", {
+                        "requestId": req_id, "ok": True, "removed": removed,
+                    })
+
                 elif mtype == "config":
                     # Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog
                     # damit Stefan im laufenden Betrieb via Diagnostic-Settings
diff --git a/xtts/whisper/requirements.txt b/xtts/whisper/requirements.txt
index 2d88b24..f693b8a 100644
--- a/xtts/whisper/requirements.txt
+++ b/xtts/whisper/requirements.txt
@@ -2,3 +2,6 @@ faster-whisper==1.0.3
 websockets>=12.0
 numpy>=1.24
 requests>=2.31
+# Speaker-ID via SpeechBrain ECAPA-TDNN: reliably pick out Stefan's voice so
+# that background conversations do not trigger brain calls.
+speechbrain>=1.0.0
diff --git a/xtts/whisper/speaker_id.py b/xtts/whisper/speaker_id.py
new file mode 100644
index 0000000..571ad7e
--- /dev/null
+++ b/xtts/whisper/speaker_id.py
@@ -0,0 +1,239 @@
+"""
+Speaker-ID backend for ARIA's voice recognition.
+
+Uses SpeechBrain ECAPA-TDNN (192-dim embeddings, trained on VoxCeleb 1+2).
+Fingerprint = averaged, L2-normalised embedding vector built from N
+enrollment samples. Verify: cosine_similarity(new_recording, fingerprint).
+
+Persistence: /voice-id/fingerprint.json (float list + metadata).
+Model cache: /root/.cache/huggingface/ (bind mount shared with f5tts).
+
+Behaviour WITHOUT enrollment (no fingerprint present):
+    verify() -> (True, 0.0): fail-open, so that speaker-ID gating does not
+    accidentally block the un-enrolled brain path.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+VOICE_ID_DIR = Path(os.environ.get("VOICE_ID_DIR", "/voice-id"))
+FINGERPRINT_FILE = VOICE_ID_DIR / "fingerprint.json"
+
+# Cosine threshold: 0.5 is conservative (few false positives), 0.3 is
+# lenient (more matches, even with background noise). Stefan can fine-tune
+# it via a diagnostic setting.
+DEFAULT_THRESHOLD = 0.5
+
+# Minimum sample length for a reliable embedding
+# (~1s @ 16kHz int16 = 32000 bytes).
+MIN_SAMPLE_BYTES = 32000
+
+_model = None
+
+
+def _ensure_loaded():
+    """Lazily load the ECAPA-TDNN model.
+
+    The first call fetches the model from Hugging Face; afterwards it is
+    served from the HF cache volume (first init ~30s download + load, <1s
+    when warm). Raises on failure: the caller must catch and fail open.
+    """
+    global _model
+    if _model is not None:
+        return _model
+    import torch
+    from speechbrain.inference.speaker import EncoderClassifier
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    logger.info("[speaker-id] loading ECAPA-TDNN on %s ...", device)
+    _model = EncoderClassifier.from_hparams(
+        source="speechbrain/spkrec-ecapa-voxceleb",
+        savedir="/root/.cache/huggingface/speechbrain-ecapa",
+        run_opts={"device": device},
+    )
+    logger.info("[speaker-id] model ready (device=%s)", device)
+    return _model
+
+
+def _audio_bytes_to_tensor(audio_bytes: bytes):
+    """Convert int16 LE PCM (16kHz mono) to a float32 torch tensor of shape
+    (1, N), scaled by 1/32768.
+
+    A trailing odd byte (a truncated final sample) is ignored; without the
+    explicit count, np.frombuffer raises ValueError on such buffers.
+    """
+    import torch
+    # "<i2" pins little-endian instead of relying on the host byte order.
+    pcm = np.frombuffer(audio_bytes, dtype="<i2", count=len(audio_bytes) // 2)
+    return torch.from_numpy(pcm.astype(np.float32) / 32768.0).unsqueeze(0)
+
+
+def embed(audio_bytes: bytes) -> np.ndarray:
+    """Compute the speaker embedding for one audio chunk.
+
+    Expects 16kHz int16 LE mono PCM. Returns a 192-dim float32 numpy array.
+    """
+    import torch
+    model = _ensure_loaded()
+    wav = _audio_bytes_to_tensor(audio_bytes)
+    with torch.no_grad():
+        emb = model.encode_batch(wav)
+    return emb.squeeze().cpu().numpy().astype(np.float32)
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Cosine similarity between two 1-D vectors, range [-1, 1].
+
+    Higher = more similar. For normalised vectors this equals the dot
+    product. Returns 0.0 if either vector is (near) zero.
+    """
+    na = np.linalg.norm(a)
+    nb = np.linalg.norm(b)
+    if na < 1e-9 or nb < 1e-9:
+        return 0.0
+    return float(np.dot(a, b) / (na * nb))
+
+
+def save_fingerprint(embeddings: list[np.ndarray], sample_durations_s: list[float]) -> dict:
+    """Average and L2-normalise the embeddings and write them to
+    FINGERPRINT_FILE. Returns the stored dict.
+
+    The JSON is written to a temporary sibling file and moved into place
+    with os.replace, so an interrupted write cannot leave a truncated
+    fingerprint behind (load_fingerprint would treat that as "not
+    enrolled" and verify would silently fail open).
+    """
+    if not embeddings:
+        raise ValueError("Keine Embeddings zum Speichern")
+    VOICE_ID_DIR.mkdir(parents=True, exist_ok=True)
+    stacked = np.stack(embeddings)
+    mean = stacked.mean(axis=0)
+    mean = mean / max(np.linalg.norm(mean), 1e-9)
+    data = {
+        "version": 1,
+        "embedding": mean.tolist(),
+        "embedding_dim": int(mean.shape[0]),
+        "sample_count": len(embeddings),
+        "sample_durations_s": [float(s) for s in sample_durations_s],
+        "updated_at": int(time.time()),
+    }
+    tmp_file = FINGERPRINT_FILE.with_name(FINGERPRINT_FILE.name + ".tmp")
+    tmp_file.write_text(json.dumps(data, indent=2), encoding="utf-8")
+    os.replace(tmp_file, FINGERPRINT_FILE)
+    logger.info("[speaker-id] fingerprint gespeichert: %d Samples, dim=%d, total_s=%.1f",
+                len(embeddings), mean.shape[0], sum(sample_durations_s))
+    return data
+
+
+def load_fingerprint() -> Optional[dict]:
+    """Return the fingerprint dict, or None if not enrolled yet or if the
+    stored file is unreadable or malformed."""
+    if not FINGERPRINT_FILE.exists():
+        return None
+    try:
+        data = json.loads(FINGERPRINT_FILE.read_text(encoding="utf-8"))
+    except Exception as exc:
+        logger.warning("[speaker-id] fingerprint laden fehlgeschlagen: %s", exc)
+        return None
+    # Valid JSON of the wrong shape would otherwise crash status()/verify().
+    if not isinstance(data, dict) or not isinstance(data.get("embedding"), list):
+        logger.warning("[speaker-id] fingerprint laden fehlgeschlagen: %s",
+                       "unexpected structure")
+        return None
+    return data
+
+
+def delete_fingerprint() -> bool:
+    """Delete the fingerprint (e.g. for re-enrollment). Returns True if a
+    file was actually removed."""
+    try:
+        FINGERPRINT_FILE.unlink()
+    except FileNotFoundError:
+        # Nothing to delete; also closes the exists()/unlink() race.
+        return False
+    logger.info("[speaker-id] fingerprint geloescht")
+    return True
+
+
+def verify(audio_bytes: bytes, threshold: float = DEFAULT_THRESHOLD) -> tuple[bool, float]:
+    """Return (is_match, similarity).
+
+    Fail-open: if no fingerprint is present or computing the similarity
+    fails, returns (True, 0.0), i.e. no filtering. Otherwise a broken
+    speaker-ID service would block every recording.
+    """
+    fp = load_fingerprint()
+    if fp is None:
+        return True, 0.0
+    if len(audio_bytes) < MIN_SAMPLE_BYTES:
+        # Too little audio for a reliable embedding: let it through.
+        return True, 0.0
+    try:
+        saved_emb = np.array(fp["embedding"], dtype=np.float32)
+        new_emb = embed(audio_bytes)
+        # Kept inside the try: a stored vector of the wrong dimension makes
+        # np.dot raise, and that has to fail open too.
+        sim = cosine_similarity(new_emb, saved_emb)
+    except Exception as exc:
+        logger.warning("[speaker-id] verify embed failed: %s — fail-open", exc)
+        return True, 0.0
+    return sim >= threshold, sim
+
+
+def status() -> dict:
+    """Status snapshot for the app / diagnostics."""
+    fp = load_fingerprint()
+    return {
+        "enrolled": fp is not None,
+        "sample_count": fp.get("sample_count", 0) if fp else 0,
+        "sample_durations_s": fp.get("sample_durations_s", []) if fp else [],
+        "updated_at": fp.get("updated_at") if fp else None,
+        "embedding_dim": fp.get("embedding_dim") if fp else None,
+        "default_threshold": DEFAULT_THRESHOLD,
+    }
+
+
+def enroll_from_samples(samples_b64: list[str]) -> dict:
+    """Turn base64 samples (16kHz int16 LE mono PCM) into a new fingerprint.
+
+    Returns the stored fingerprint dict plus a "rejected" list describing
+    every sample that was skipped. Raises ValueError if nothing is usable.
+    """
+    if not samples_b64:
+        raise ValueError("Keine Samples uebergeben")
+    embeddings: list[np.ndarray] = []
+    durations: list[float] = []
+    rejected: list[dict] = []
+    for idx, s in enumerate(samples_b64):
+        try:
+            raw = base64.b64decode(s)
+        except Exception as exc:
+            rejected.append({"index": idx, "reason": f"base64: {exc}"})
+            continue
+        if len(raw) < MIN_SAMPLE_BYTES:
+            rejected.append({"index": idx, "reason": f"zu kurz ({len(raw)} bytes)"})
+            continue
+        try:
+            emb = embed(raw)
+            embeddings.append(emb)
+            durations.append(len(raw) / 2 / 16000.0)
+        except Exception as exc:
+            rejected.append({"index": idx, "reason": f"embed: {exc}"})
+    if not embeddings:
+        raise ValueError(
+            f"Keine Samples konnten verarbeitet werden ({len(rejected)} rejected). "
+            f"Details: {rejected[:3]}"
+        )
+    fingerprint = save_fingerprint(embeddings, durations)
+    fingerprint["rejected"] = rejected
+    return fingerprint