feat(speaker-id): Phase 2 — Enrollment-UI (App) + Voice-ID-Section (Diagnostic)

App-Seite: - VoiceIdEnrollment.tsx (neue Komponente, ~370 Zeilen): Status-Karte (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button (disabled bis 5 Samples), Fingerprint-Delete mit Confirm. - SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen Wake-Word und Sprachausgabe. - Sample-Format: WAV via audioService.startRecording — wird whisper-bridge-seitig per wave-Modul gestrippt. Diagnostic-Seite: - Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70 (persistiert in voice_config.json, broadcast als config-Message), Refresh + Delete-Button. - server.js: 2 neue actions (voice_id_status, voice_id_delete), send_voice_config nimmt voiceIdThreshold mit auf. Backend: - speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header (RIFF/WAVE) und strippt auf rohes PCM — sonst werfen die ECAPA-Embeddings auf den 44-Byte-Header rein. - bridge.py: config-Broadcast-Handler setzt voiceIdThreshold auf speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating genutzt, persistiert aber schon). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 20:36:06 +02:00
parent 6e19adab87
commit e3fe27f736
6 changed files with 578 additions and 2 deletions
@@ -764,6 +764,42 @@
      </div>
    </div>

+    <!-- Voice-ID (speaker recognition): status line, match-threshold slider,
+         refresh and delete buttons. The status line (#voice-id-status) is
+         filled in by the voice_id_status_response handler; a committed slider
+         change is sent via sendVoiceConfig(). -->
+    <div class="settings-section">
+      <h2>Voice-ID (Sprecher-Erkennung)</h2>
+      <div style="font-size:11px;color:#8888AA;margin-bottom:8px;">
+        ARIA erkennt Stefans Stimme anhand eines Fingerprints (SpeechBrain ECAPA-TDNN).
+        Andere Sprecher (TV, Hintergrund-Gespraeche) werden gefiltert — keine
+        Brain-Calls, keine Tokens. Enrollment passiert in der App (Settings → Stimme einrichten),
+        weil das Handy-Mikro auch im Betrieb hoert.
+      </div>
+      <div class="card" style="max-width:500px;">
+        <div id="voice-id-status" style="font-size:13px;color:#E0E0F0;margin-bottom:10px;">
+          Status wird geladen...
+        </div>
+        <div style="display:flex;align-items:center;gap:12px;margin-bottom:8px;">
+          <!-- "for" ties the label to the slider for assistive technology and click-to-focus. -->
+          <label for="diag-voice-id-threshold" style="color:#8888AA;font-size:12px;min-width:130px;">Match-Threshold:</label>
+          <!-- oninput only updates the readout (always two decimals, matching the
+               initial "0.50"); onchange sends the committed value. -->
+          <input type="range" id="diag-voice-id-threshold" min="0.30" max="0.70" step="0.05" value="0.50"
+                 oninput="document.getElementById('voice-id-threshold-display').textContent = Number(this.value).toFixed(2)"
+                 onchange="sendVoiceConfig()"
+                 style="flex:1;">
+          <span id="voice-id-threshold-display" style="color:#E0E0F0;font-family:monospace;min-width:40px;text-align:right;">0.50</span>
+        </div>
+        <div style="font-size:10px;color:#555570;margin-bottom:12px;">
+          Niedriger = mehr Treffer auch bei Nebengeraeuschen (false-positives).
+          Hoeher = strenger, kann Stefan auch mal verpassen. 0.50 ist konservativer Default.
+        </div>
+        <div style="display:flex;gap:8px;">
+          <button class="btn secondary" onclick="refreshVoiceIdStatus()" style="padding:6px 14px;font-size:12px;">
+            🔄 Status aktualisieren
+          </button>
+          <button class="btn danger" onclick="deleteVoiceId()" style="padding:6px 14px;font-size:12px;">
+            🗑 Fingerprint löschen
+          </button>
+        </div>
+      </div>
+    </div>
+
    <!-- Runtime-Konfiguration -->
    <div class="settings-section">
      <h2>Runtime-Konfiguration</h2>
@@ -1475,6 +1511,46 @@
          setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw);
          setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch);
          setIfPresent('diag-flux-hf-token', msg.huggingfaceToken);
+          // Restore the Voice-ID threshold slider and its numeric readout from the
+          // received config (the markup default is 0.50).
+          if (msg.voiceIdThreshold !== undefined && msg.voiceIdThreshold !== null) {
+            const slider = document.getElementById('diag-voice-id-threshold');
+            const display = document.getElementById('voice-id-threshold-display');
+            if (slider) slider.value = msg.voiceIdThreshold;
+            if (display) {
+              // Read the value back from the slider when it exists: a range input
+              // clamps to its min/max, and the slider value is what
+              // sendVoiceConfig() reads, so the readout must match it.
+              const shown = slider ? slider.value : msg.voiceIdThreshold;
+              display.textContent = Number(shown).toFixed(2);
+            }
+          }
+          return;
+        }
+
+        if (msg.type === 'voice_id_status_response') {
+          const el = document.getElementById('voice-id-status');
+          if (!el) return;
+          if (msg.payload && msg.payload.ok === false) {
+            el.innerHTML = '<span style="color:#FF6E6E;">⚠ Whisper-Bridge nicht erreichbar: ' +
+                           (msg.payload.error || 'unbekannt') + '</span>';
+            return;
+          }
+          const p = msg.payload || msg;
+          if (p.enrolled) {
+            const when = p.updated_at ? new Date(p.updated_at * 1000).toLocaleString('de-DE') : '?';
+            const totalSec = (p.sample_durations_s || []).reduce((a, b) => a + b, 0);
+            el.innerHTML = '<span style="color:#34C759;">✓ Enrolled</span> · ' +
+                           p.sample_count + ' Samples (' + totalSec.toFixed(1) + 's) · ' +
+                           'aktualisiert ' + when + ' · dim=' + (p.embedding_dim || '?');
+          } else {
+            el.innerHTML = '<span style="color:#FFD60A;">○ Nicht enrolled</span> — ' +
+                           'in der App unter "Stimme einrichten" 5-10× je 3s aufnehmen.';
+          }
+          return;
+        }
+
+        if (msg.type === 'voice_id_delete_response') {
+          const p = msg.payload || msg;
+          if (p.removed) {
+            alert('Fingerprint gelöscht — Voice-ID-Gating fällt zurück auf Fail-Open.');
+          } else {
+            alert('Es war kein Fingerprint vorhanden.');
+          }
+          refreshVoiceIdStatus();
          return;
        }

@@ -2607,6 +2683,17 @@
      });
    }

+    function refreshVoiceIdStatus() {
+      const el = document.getElementById('voice-id-status');
+      if (el) el.textContent = '⏳ Status wird abgefragt...';
+      send({ action: 'voice_id_status' });
+    }
+
+    function deleteVoiceId() {
+      if (!confirm('Voice-ID-Fingerprint loeschen?\n\nDanach muss in der App neu enrolled werden.')) return;
+      send({ action: 'voice_id_delete' });
+    }
+
    function deleteXttsVoice(name) {
      if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
      send({ action: 'xtts_delete_voice', name });
@@ -2823,12 +2910,15 @@
      const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value;
      const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value;
      const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value;
+      const voiceIdThresholdRaw = document.getElementById('diag-voice-id-threshold')?.value;
+      const voiceIdThreshold = voiceIdThresholdRaw ? parseFloat(voiceIdThresholdRaw) : undefined;
      send({
        action: 'send_voice_config',
        ttsEnabled, xttsVoice, whisperModel,
        f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
        f5ttsCfgStrength, f5ttsNfeStep,
        fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken,
+        voiceIdThreshold,
      });
      const statusEl = document.getElementById('voice-status');
      if (statusEl && xttsVoice) {
@@ -3354,6 +3444,7 @@
        loadRuntimeConfig();
        loadOnboardingQR();
        loadOAuthServices();
+        refreshVoiceIdStatus();
      } else if (tab === 'brain') {
        loadBrainStatus();
        loadBrainMemoryList();
@@ -2367,6 +2367,12 @@ wss.on("connection", (ws) => {
        if (msg.huggingfaceToken !== undefined) {
          voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim();
        }
+        // Voice-ID match threshold (0.30-0.70 per the original note; the check
+        // below accepts any value in [0, 1]). Picked up by the whisper-bridge
+        // through the config broadcast — Phase 3 uses it for gating.
+        if (msg.voiceIdThreshold !== undefined && !isNaN(msg.voiceIdThreshold)) {
+          const parsedThreshold = parseFloat(msg.voiceIdThreshold);
+          const withinUnitRange = parsedThreshold >= 0.0 && parsedThreshold <= 1.0;
+          if (withinUnitRange) {
+            voiceConfig.voiceIdThreshold = parsedThreshold;
+          }
+        }
        try {
          fs.mkdirSync("/shared/config", { recursive: true });
          fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
@@ -2390,6 +2396,15 @@ wss.on("connection", (ws) => {
        handleGetModel(ws);
      } else if (msg.action === "set_model") {
        handleSetModel(ws, msg.model);
+      } else if (msg.action === "voice_id_status") {
+        // An whisper-bridge weiterleiten + Antwort an Browser zurueck
+        const reqId = `vid_${Date.now().toString(36)}`;
+        sendToRVS_withResponse("voice_id_status_request", { requestId: reqId },
+                               "voice_id_status_response", ws);
+      } else if (msg.action === "voice_id_delete") {
+        const reqId = `viddel_${Date.now().toString(36)}`;
+        sendToRVS_withResponse("voice_id_delete_request", { requestId: reqId },
+                               "voice_id_delete_response", ws);
      }
      // get_openclaw_config entfernt — aria-core ist raus.
    } catch {}