feat(speaker-id): Phase 1 — SpeechBrain ECAPA-TDNN Backend in whisper-bridge
Speaker-ID-Modul (Hermes-Style „echtes Gespraech ohne Wake-Word"-Vision, Phase 1 von 5). Erkennt Stefans Stimme via 192-dim Embedding + Cosine-Match gegen einen persistierten Fingerprint. Module: - speaker_id.py: lazy-loaded ECAPA-TDNN (HuggingFace), enroll/verify/status/delete. Fingerprint = L2-normalisierter Mittelwert aus N Enrollment-Samples in /voice-id/fingerprint.json. Fail-open: kein Fingerprint → verify() returnt (True, 0.0). - bridge.py: 3 Message-Handler — voice_id_status_request, voice_id_enroll_request (samples[]: base64 16kHz int16 PCM), voice_id_delete_request. Enrollment laeuft im Executor (Torch blockt sonst die Event-Loop). - Dockerfile: torch 2.3.1 + torchaudio mit CUDA-12.1-Wheels (sonst zieht speechbrain CPU-only Torch rein). Container ~1 GB groesser. - docker-compose.yml: ./voice-id:/voice-id Bind-Mount fuer Fingerprint-Persistenz (ueberlebt Container-Restart). - rvs/server.js: 6 neue Message-Types in ALLOWED_TYPES. Phase 2 (next): App-Enrollment-Flow + Diagnostic-Voice-ID-Section. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,12 @@ const ALLOWED_TYPES = new Set([
|
||||
// die feuert stt_endpoint mit dem finalen Text — kein Audio-Roundtrip.
|
||||
"stt_stream_start", "stt_audio_chunk", "stt_stream_end",
|
||||
"stt_partial", "stt_endpoint", "stt_stream_done",
|
||||
// Speaker-ID / Voice-Enrollment (Phase 1+2): App schickt 5-10 Samples zur
|
||||
// whisper-bridge, die berechnet einen Voice-Fingerprint (Embedding-Vektor)
|
||||
// und nutzt ihn um nur Stefans Stimme an Whisper STT durchzulassen.
|
||||
"voice_id_status_request", "voice_id_status_response",
|
||||
"voice_id_enroll_request", "voice_id_enroll_response",
|
||||
"voice_id_delete_request", "voice_id_delete_response",
|
||||
// File-Versioning (Datei-Manager in App): Versionen pro Datei listen,
|
||||
// alte Versionen herunterladen, Restore = non-destructive neuer Commit.
|
||||
"file_version_list_request", "file_version_list_response",
|
||||
|
||||
@@ -85,4 +85,7 @@ services:
|
||||
# ein Modell muss nur einmal pro
|
||||
# Maschine geladen werden, kein
|
||||
# Re-Download bei Container-Restart.
|
||||
- ./voice-id:/voice-id # Speaker-ID-Fingerprint (Stefans
|
||||
# Stimm-Embedding) persistent zwischen
|
||||
# Container-Restarts.
|
||||
restart: unless-stopped
|
||||
|
||||
+10
-2
@@ -1,14 +1,22 @@
|
||||
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip ffmpeg \
|
||||
python3 python3-pip ffmpeg git \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# PyTorch CUDA-Wheels zuerst (sonst zieht speechbrain CPU-only Torch rein
|
||||
# falls f5tts den Cache noch nicht geseedet hat).
|
||||
RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
|
||||
--index-url https://download.pytorch.org/whl/cu121
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip3 install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY bridge.py .
|
||||
COPY bridge.py speaker_id.py ./
|
||||
|
||||
CMD ["python3", "bridge.py"]
|
||||
|
||||
@@ -33,6 +33,8 @@ import sys
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import speaker_id
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
@@ -729,6 +731,52 @@ async def run_loop(runner: WhisperRunner, sessions: SessionManager) -> None:
|
||||
f"received id={req_id[:12]} reason={payload.get('reason', '')}")
|
||||
sessions.end_session(req_id)
|
||||
|
||||
elif mtype == "voice_id_status_request":
|
||||
req_id = payload.get("requestId", "")
|
||||
try:
|
||||
status = speaker_id.status()
|
||||
except Exception as exc:
|
||||
await _send(ws, "voice_id_status_response", {
|
||||
"requestId": req_id, "ok": False, "error": str(exc)[:200],
|
||||
})
|
||||
continue
|
||||
await _send(ws, "voice_id_status_response", {
|
||||
"requestId": req_id, "ok": True, **status,
|
||||
})
|
||||
|
||||
elif mtype == "voice_id_enroll_request":
|
||||
# samples: Liste von base64-kodierten int16-LE-PCM-Buffern,
|
||||
# 16kHz mono, je ~3-5s. App nimmt sie nacheinander auf und
|
||||
# schickt sie zusammen.
|
||||
req_id = payload.get("requestId", "")
|
||||
samples = payload.get("samples") or []
|
||||
logger.info("voice_id_enroll_request: %d Samples (id=%s)",
|
||||
len(samples), req_id[:8])
|
||||
try:
|
||||
result = await asyncio.get_running_loop().run_in_executor(
|
||||
None, speaker_id.enroll_from_samples, samples
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("voice_id_enroll failed: %s", exc)
|
||||
await _send(ws, "voice_id_enroll_response", {
|
||||
"requestId": req_id, "ok": False, "error": str(exc)[:300],
|
||||
})
|
||||
continue
|
||||
await _send(ws, "voice_id_enroll_response", {
|
||||
"requestId": req_id, "ok": True,
|
||||
"sample_count": result.get("sample_count", 0),
|
||||
"rejected": result.get("rejected", []),
|
||||
"updated_at": result.get("updated_at"),
|
||||
"embedding_dim": result.get("embedding_dim"),
|
||||
})
|
||||
|
||||
elif mtype == "voice_id_delete_request":
|
||||
req_id = payload.get("requestId", "")
|
||||
removed = speaker_id.delete_fingerprint()
|
||||
await _send(ws, "voice_id_delete_response", {
|
||||
"requestId": req_id, "ok": True, "removed": removed,
|
||||
})
|
||||
|
||||
elif mtype == "config":
|
||||
# Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog
|
||||
# damit Stefan im laufenden Betrieb via Diagnostic-Settings
|
||||
|
||||
@@ -2,3 +2,6 @@ faster-whisper==1.0.3
|
||||
websockets>=12.0
|
||||
numpy>=1.24
|
||||
requests>=2.31
|
||||
# Speaker-ID via SpeechBrain ECAPA-TDNN — Stimme von Stefan zuverlaessig
|
||||
# rauskennen damit Hintergrund-Gespraeche keine Brain-Calls triggern.
|
||||
speechbrain>=1.0.0
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
Speaker-ID Backend fuer ARIAs Stimmen-Erkennung.
|
||||
|
||||
Nutzt SpeechBrain ECAPA-TDNN (192-dim Embeddings, auf VoxCeleb-1+2 trainiert).
|
||||
Fingerprint = gemittelter, L2-normalisierter Embedding-Vektor aus N
|
||||
Enrollment-Samples. Verify: cosine_similarity(neue_aufnahme, fingerprint).
|
||||
|
||||
Persistenz: /voice-id/fingerprint.json (Float-Liste + Metadaten).
|
||||
Modell-Cache: /root/.cache/huggingface/ (Bind-Mount mit f5tts geteilt).
|
||||
|
||||
Verhalten OHNE Enrollment (kein Fingerprint vorhanden):
|
||||
verify() → (True, 0.0) — Fail-open, damit Speaker-ID-Gating den
|
||||
ungeenrollten Brain-Pfad nicht versehentlich blockiert.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Directory holding the persisted fingerprint; overridable via the
# VOICE_ID_DIR environment variable, defaults to /voice-id.
VOICE_ID_DIR = Path(os.environ.get("VOICE_ID_DIR", "/voice-id"))
|
||||
FINGERPRINT_FILE = VOICE_ID_DIR / "fingerprint.json"
|
||||
|
||||
# Cosine threshold: 0.5 is conservative (few false positives), 0.3 is
|
||||
# lenient (more matches, even with background noise). Stefan can fine-tune
|
||||
# it via a diagnostic setting.
|
||||
DEFAULT_THRESHOLD = 0.5
|
||||
|
||||
# Minimum sample length for a reliable embedding (~1s @ 16kHz int16 = 32000 bytes)
|
||||
MIN_SAMPLE_BYTES = 32000
|
||||
|
||||
# Lazily created encoder instance; populated by _ensure_loaded() on first use.
_model = None
|
||||
|
||||
|
||||
def _ensure_loaded():
    """Return the shared ECAPA-TDNN encoder, creating it on first use.

    The encoder is stored in the module-level ``_model`` and reused by every
    later call. According to the original author's notes the first
    initialisation downloads the model (roughly 30 s) and later starts are
    served from the cache directory in under a second.

    Returns:
        The ``EncoderClassifier`` instance held in ``_model``.

    Raises:
        Whatever the torch/speechbrain import or ``from_hparams`` raises;
        nothing is caught here, so callers have to handle the failure
        (and fail open) themselves.
    """
    global _model
    if _model is None:
        # Heavy dependencies are imported lazily so that merely importing
        # this module does not pull in torch/speechbrain.
        import torch
        from speechbrain.inference.speaker import EncoderClassifier

        if torch.cuda.is_available():
            target = "cuda"
        else:
            target = "cpu"
        logger.info("[speaker-id] loading ECAPA-TDNN on %s ...", target)
        # _model stays None if from_hparams raises, so the next call retries.
        _model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="/root/.cache/huggingface/speechbrain-ecapa",
            run_opts={"device": target},
        )
        logger.info("[speaker-id] model ready (device=%s)", target)
    return _model
|
||||
|
||||
|
||||
def _audio_bytes_to_tensor(audio_bytes: bytes):
    """Convert raw 16-bit little-endian PCM into a float waveform tensor.

    Args:
        audio_bytes: Raw int16 little-endian PCM samples. Callers are
            expected to pass 16 kHz mono audio; neither the sample rate nor
            the channel count is checked here.

    Returns:
        A float32 torch tensor of shape ``(1, N)`` with values scaled into
        ``[-1, 1)`` (each sample divided by 32768).
    """
    import torch

    # An int16 sample is two bytes wide and np.frombuffer raises on a buffer
    # whose size is not a multiple of the item size. A dangling trailing byte
    # (e.g. from a chunked recording cut mid-sample) is dropped instead of
    # rejecting the whole recording.
    usable = len(audio_bytes) - (len(audio_bytes) % 2)
    # "<i2" pins the byte order to little-endian as the contract states,
    # instead of relying on the host's native int16 layout.
    pcm = np.frombuffer(memoryview(audio_bytes)[:usable], dtype="<i2")
    # astype() copies, so the resulting array is writable and native-endian,
    # which torch.from_numpy requires.
    arr = pcm.astype(np.float32) / 32768.0
    return torch.from_numpy(arr).unsqueeze(0)
|
||||
|
||||
|
||||
def embed(audio_bytes: bytes) -> np.ndarray:
    """Compute the speaker embedding for one audio chunk.

    Args:
        audio_bytes: 16 kHz int16 little-endian mono PCM.

    Returns:
        The encoder output with singleton dimensions squeezed away, as a
        float32 numpy array (192 values for this model according to the
        module documentation).

    Raises:
        Any exception from loading the model, decoding the PCM buffer or
        running the encoder; nothing is caught here.
    """
    import torch

    encoder = _ensure_loaded()
    waveform = _audio_bytes_to_tensor(audio_bytes)
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        batch = encoder.encode_batch(waveform)
    vector = batch.squeeze().cpu().numpy()
    return vector.astype(np.float32)
|
||||
|
||||
|
||||
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors, clamped to [-1, 1].

    Higher means more similar. For unit-length vectors this equals the dot
    product.

    Args:
        a: First vector. Any array-like; it is flattened to 1-D, so a
            ``(1, D)`` embedding is accepted as well as a ``(D,)`` one.
        b: Second vector, same number of elements as ``a``.

    Returns:
        The similarity as a Python float. ``0.0`` if either vector has a
        norm below 1e-9 (direction undefined). NaN inputs propagate as NaN.

    Raises:
        ValueError: If the two vectors differ in number of elements.
    """
    # Work in float64: float32 rounding could otherwise yield values such as
    # 1.0000001 for identical vectors, violating the documented range.
    vec_a = np.asarray(a, dtype=np.float64).ravel()
    vec_b = np.asarray(b, dtype=np.float64).ravel()
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a < 1e-9 or norm_b < 1e-9:
        return 0.0
    score = np.dot(vec_a, vec_b) / (norm_a * norm_b)
    # np.clip (unlike builtin min/max) keeps NaN as NaN instead of turning
    # it into a bound, so a broken input can never look like a perfect match.
    return float(np.clip(score, -1.0, 1.0))
|
||||
|
||||
|
||||
def save_fingerprint(embeddings: list[np.ndarray], sample_durations_s: list[float]) -> dict:
    """Average and L2-normalise the embeddings and persist them as JSON.

    The file is written to a temporary sibling first and then moved over
    FINGERPRINT_FILE with ``os.replace``. Enrollment may run in a worker
    thread while another caller reads the fingerprint; writing in place
    could expose a half-written file to that reader (or leave one behind
    after a crash), which would be treated as "not enrolled".

    Args:
        embeddings: Non-empty list of equally shaped embedding vectors.
        sample_durations_s: Duration in seconds of each enrollment sample;
            stored as metadata and summed for the log line.

    Returns:
        The dict that was written: ``version``, ``embedding``,
        ``embedding_dim``, ``sample_count``, ``sample_durations_s`` and
        ``updated_at`` (Unix timestamp in whole seconds).

    Raises:
        ValueError: If ``embeddings`` is empty, or the vectors cannot be
            stacked because their shapes differ.
        OSError: If the directory or file cannot be written.
    """
    if not embeddings:
        raise ValueError("Keine Embeddings zum Speichern")
    VOICE_ID_DIR.mkdir(parents=True, exist_ok=True)
    stacked = np.stack(embeddings)
    mean = stacked.mean(axis=0)
    # Guard against division by zero for a degenerate all-zero mean.
    mean = mean / max(np.linalg.norm(mean), 1e-9)
    data = {
        "version": 1,
        "embedding": mean.tolist(),
        "embedding_dim": int(mean.shape[0]),
        "sample_count": len(embeddings),
        "sample_durations_s": [float(s) for s in sample_durations_s],
        "updated_at": int(time.time()),
    }
    # Same directory as the target so os.replace stays on one filesystem.
    tmp_file = FINGERPRINT_FILE.with_name(FINGERPRINT_FILE.name + ".tmp")
    try:
        tmp_file.write_text(json.dumps(data, indent=2), encoding="utf-8")
        os.replace(tmp_file, FINGERPRINT_FILE)
    except Exception:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            tmp_file.unlink()
        except OSError:
            pass
        raise
    logger.info("[speaker-id] fingerprint gespeichert: %d Samples, dim=%d, total_s=%.1f",
                len(embeddings), mean.shape[0], sum(sample_durations_s))
    return data
|
||||
|
||||
|
||||
def load_fingerprint() -> Optional[dict]:
    """Load the persisted fingerprint.

    Returns:
        The fingerprint dict, or ``None`` when no fingerprint file exists
        (not enrolled yet), the file cannot be read or parsed, or its
        top-level JSON value is not an object. Unreadable or malformed
        files are logged as a warning; a missing file is not.
    """
    # Read directly instead of checking exists() first: the file may be
    # deleted between the check and the read.
    try:
        data = json.loads(FINGERPRINT_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except Exception as exc:
        logger.warning("[speaker-id] fingerprint laden fehlgeschlagen: %s", exc)
        return None
    # Callers use dict access (.get / ["embedding"]); valid JSON that is not
    # an object (list, number, null, ...) must not be handed to them.
    if not isinstance(data, dict):
        logger.warning("[speaker-id] fingerprint laden fehlgeschlagen: %s",
                       f"unexpected JSON type {type(data).__name__}")
        return None
    return data
|
||||
|
||||
|
||||
def delete_fingerprint() -> bool:
    """Delete the persisted fingerprint (e.g. before re-enrollment).

    Returns:
        ``True`` if a fingerprint file was removed, ``False`` if there was
        none.

    Raises:
        OSError: For failures other than the file being absent
            (e.g. missing permissions).
    """
    # Unlink directly rather than exists()-then-unlink(): if the file
    # disappears between the two calls, the old form raised
    # FileNotFoundError instead of reporting "nothing removed".
    try:
        FINGERPRINT_FILE.unlink()
    except FileNotFoundError:
        return False
    logger.info("[speaker-id] fingerprint geloescht")
    return True
|
||||
|
||||
|
||||
def verify(audio_bytes: bytes, threshold: float = DEFAULT_THRESHOLD) -> tuple[bool, float]:
    """Check whether an audio chunk matches the enrolled fingerprint.

    Fail-open by design: a broken speaker-ID service must not block the
    whole recording, so every failure path lets the audio through.

    Args:
        audio_bytes: 16 kHz int16 little-endian mono PCM.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        ``(is_match, similarity)``. ``(True, 0.0)`` is returned when no
        fingerprint is stored, the audio is shorter than MIN_SAMPLE_BYTES,
        computing or comparing the embeddings fails, or the similarity is
        not a finite number.
    """
    fp = load_fingerprint()
    if fp is None:
        return True, 0.0
    if len(audio_bytes) < MIN_SAMPLE_BYTES:
        # Too little audio for a reliable embedding -> let it through.
        return True, 0.0
    try:
        saved_emb = np.array(fp["embedding"], dtype=np.float32)
        new_emb = embed(audio_bytes)
        # The comparison belongs inside the try: a stored fingerprint whose
        # dimension differs from the model output makes it raise, and that
        # must fail open like any other error.
        sim = cosine_similarity(new_emb, saved_emb)
    except Exception as exc:
        logger.warning("[speaker-id] verify embed failed: %s — fail-open", exc)
        return True, 0.0
    if not np.isfinite(sim):
        # NaN compares False against any threshold, which would silently
        # reject every recording; treat it as a failure instead.
        logger.warning("[speaker-id] verify similarity not finite (%s) — fail-open", sim)
        return True, 0.0
    return sim >= threshold, sim
|
||||
|
||||
|
||||
def status() -> dict:
    """Build a status snapshot for the app / diagnostics view.

    Returns:
        A dict with ``enrolled`` (whether a fingerprint could be loaded),
        the stored ``sample_count``, ``sample_durations_s``, ``updated_at``
        and ``embedding_dim`` (defaults when nothing is stored), plus
        ``default_threshold``.
    """
    fp = load_fingerprint()
    # Start from the "not enrolled" defaults, then overlay stored metadata.
    snapshot = {
        "enrolled": fp is not None,
        "sample_count": 0,
        "sample_durations_s": [],
        "updated_at": None,
        "embedding_dim": None,
        "default_threshold": DEFAULT_THRESHOLD,
    }
    if fp:
        snapshot["sample_count"] = fp.get("sample_count", 0)
        snapshot["sample_durations_s"] = fp.get("sample_durations_s", [])
        snapshot["updated_at"] = fp.get("updated_at")
        snapshot["embedding_dim"] = fp.get("embedding_dim")
    return snapshot
|
||||
|
||||
|
||||
def enroll_from_samples(samples_b64: list[str]) -> dict:
    """Turn base64-encoded PCM samples into a new persisted fingerprint.

    Each sample is decoded, length-checked and embedded on its own; a sample
    that fails any step is recorded as rejected and the rest are still used.

    Args:
        samples_b64: Base64 strings, each holding 16 kHz int16 little-endian
            mono PCM.

    Returns:
        The dict written by ``save_fingerprint`` with an extra ``rejected``
        key: a list of ``{"index", "reason"}`` entries for skipped samples.

    Raises:
        ValueError: If no samples were passed, or none could be processed.
    """
    if not samples_b64:
        raise ValueError("Keine Samples uebergeben")

    accepted_vectors: list[np.ndarray] = []
    accepted_seconds: list[float] = []
    rejected: list[dict] = []

    def _reject(position: int, why: str) -> None:
        rejected.append({"index": position, "reason": why})

    for position, encoded in enumerate(samples_b64):
        try:
            pcm = base64.b64decode(encoded)
        except Exception as exc:
            _reject(position, f"base64: {exc}")
            continue

        n_bytes = len(pcm)
        if n_bytes < MIN_SAMPLE_BYTES:
            _reject(position, f"zu kurz ({n_bytes} bytes)")
            continue

        try:
            vector = embed(pcm)
        except Exception as exc:
            _reject(position, f"embed: {exc}")
            continue

        accepted_vectors.append(vector)
        # 2 bytes per int16 sample at 16000 samples per second.
        accepted_seconds.append(n_bytes / 2 / 16000.0)

    if not accepted_vectors:
        raise ValueError(
            f"Keine Samples konnten verarbeitet werden ({len(rejected)} rejected). "
            f"Details: {rejected[:3]}"
        )

    result = save_fingerprint(accepted_vectors, accepted_seconds)
    result["rejected"] = rejected
    return result
|
||||
Reference in New Issue
Block a user