feat(speaker-id): Phase 2 — Enrollment-UI (App) + Voice-ID-Section (Diagnostic)

App-Seite: - VoiceIdEnrollment.tsx (neue Komponente, ~370 Zeilen): Status-Karte (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button (disabled bis 5 Samples), Fingerprint-Delete mit Confirm. - SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen Wake-Word und Sprachausgabe. - Sample-Format: WAV via audioService.startRecording — wird whisper-bridge-seitig per wave-Modul gestrippt. Diagnostic-Seite: - Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70 (persistiert in voice_config.json, broadcast als config-Message), Refresh + Delete-Button. - server.js: 2 neue actions (voice_id_status, voice_id_delete), send_voice_config nimmt voiceIdThreshold mit auf. Backend: - speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header (RIFF/WAVE) und strippt auf rohes PCM — sonst fallen die ECAPA-Embeddings auf den 44-Byte-Header rein. - bridge.py: config-Broadcast-Handler setzt voiceIdThreshold auf speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating genutzt, persistiert aber schon). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 20:36:06 +02:00
parent 6e19adab87
commit e3fe27f736
6 changed files with 578 additions and 2 deletions
@@ -0,0 +1,426 @@
 /**
 * Voice-ID enrollment + status — app side.
 *
 * The user records 5-7 samples (4 s each; at least 5 are required before saving)
 * of their voice, and the app sends them to the whisper-bridge via RVS
 * (voice_id_enroll_request). The bridge computes SpeechBrain ECAPA embeddings,
 * averages them into one fingerprint and stores it in
 * /voice-id/fingerprint.json.
 *
 * Usage: rendered by SettingsScreen for the 'voice_id' section.
 * Fetches the status on mount and again after every enroll/delete.
 */
 import React, { useCallback, useEffect, useState } from 'react';
 import {
  ActivityIndicator,
  Alert,
  ScrollView,
  StyleSheet,
  Text,
  ToastAndroid,
  TouchableOpacity,
  View,
 } from 'react-native';
 import audioService from '../services/audio';
 import rvs from '../services/rvs';
 /** Fixed recording length of one enrollment sample, in milliseconds. */
 const SAMPLE_DURATION_MS = 4_000;
 /** Minimum number of recorded samples before saving is allowed. */
 const SAMPLES_REQUIRED = 5;

 /** One recorded sample as held in component state until it is submitted. */
 interface Sample {
  /** Recording payload, taken from the `base64` field of the stop-recording result. */
  base64: string;
  /** Recorded length in milliseconds, taken from the stop-recording result. */
  durationMs: number;
 }

 /** Status variant shown once a fingerprint exists; fields mirror the status response payload. */
 interface EnrolledStatus {
  state: 'enrolled';
  /** Filled from `sample_count`. */
  sampleCount: number;
  /** Filled from `sample_durations_s`; summed and displayed as seconds. */
  durations: number[];
  /** Filled from `updated_at`; multiplied by 1000 before being turned into a Date. */
  updatedAt: number;
  /** Filled from `embedding_dim`. */
  dim: number;
 }

 /** Discriminated union describing what the status card renders. */
 type Status =
  | { state: 'loading' }
  | { state: 'unenrolled' }
  | EnrolledStatus
  | { state: 'error'; message: string };
 /**
 * Builds a request id of the form `<prefix>_<time>_<random>`.
 *
 * The time part is `Date.now()` and the random part an integer below 1e6,
 * both rendered in base 36.
 *
 * @param prefix Leading tag that identifies the kind of request.
 * @returns The assembled id string.
 */
 function _newReqId(prefix: string): string {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
  return prefix + '_' + timePart + '_' + randomPart;
 }
 /**
 * Voice-ID settings section: shows the enrollment status reported by the
 * whisper-bridge, records fixed-length voice samples, submits them for
 * enrollment and lets the user delete the stored fingerprint.
 *
 * All bridge traffic goes through `rvs` messages. Responses are matched by
 * message type only; the generated requestId is sent along but not checked here.
 */
 export const VoiceIdEnrollment: React.FC = () => {
  const [status, setStatus] = useState<Status>({ state: 'loading' });
  const [samples, setSamples] = useState<Sample[]>([]);
  const [recording, setRecording] = useState(false);
  const [recordCountdown, setRecordCountdown] = useState(0);
  const [enrollPending, setEnrollPending] = useState(false);
  // Handle of the armed enrollment safety timeout (null = none). Kept in a ref so
  // it can be cancelled when the response arrives or the component unmounts;
  // otherwise a stale timer from an earlier request could abort a later one.
  const enrollTimeoutRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);

  const clearEnrollTimeout = useCallback(() => {
    if (enrollTimeoutRef.current !== null) {
      clearTimeout(enrollTimeoutRef.current);
      enrollTimeoutRef.current = null;
    }
  }, []);

  // Cancel a still-armed timeout on unmount.
  useEffect(() => () => clearEnrollTimeout(), [clearEnrollTimeout]);

  // Request the current enrollment status; the answer arrives via rvs.onMessage.
  const refreshStatus = useCallback(() => {
    setStatus({ state: 'loading' });
    rvs.send('voice_id_status_request' as any, { requestId: _newReqId('vid') });
  }, []);

  useEffect(() => {
    refreshStatus();
  }, [refreshStatus]);

  // Handle RVS responses (status / enroll / delete).
  useEffect(() => {
    const unsub = rvs.onMessage((msg: any) => {
      if (!msg) return;
      const p = msg.payload || {};
      if (msg.type === 'voice_id_status_response') {
        if (p.ok === false) {
          setStatus({ state: 'error', message: p.error || 'Whisper-Bridge nicht erreichbar' });
          return;
        }
        if (p.enrolled) {
          setStatus({
            state: 'enrolled',
            sampleCount: p.sample_count || 0,
            durations: p.sample_durations_s || [],
            updatedAt: p.updated_at || 0,
            dim: p.embedding_dim || 0,
          });
        } else {
          setStatus({ state: 'unenrolled' });
        }
      } else if (msg.type === 'voice_id_enroll_response') {
        // A response arrived, so the safety timeout must not fire any more.
        clearEnrollTimeout();
        setEnrollPending(false);
        if (p.ok === false) {
          Alert.alert('Enrollment fehlgeschlagen', p.error || 'Unbekannter Fehler');
          return;
        }
        const rejected = (p.rejected || []).length;
        ToastAndroid.show(
          `✓ Stimme gespeichert (${p.sample_count} Samples${rejected ? `, ${rejected} verworfen` : ''})`,
          ToastAndroid.LONG,
        );
        setSamples([]);
        refreshStatus();
      } else if (msg.type === 'voice_id_delete_response') {
        // Same failure convention as the other two responses: do not report a
        // failed request as "nothing was stored".
        if (p.ok === false) {
          Alert.alert('Löschen fehlgeschlagen', p.error || 'Unbekannter Fehler');
          return;
        }
        ToastAndroid.show(p.removed ? '✓ Stimme gelöscht' : 'Es war keine gespeichert', ToastAndroid.SHORT);
        refreshStatus();
      }
    });
    return () => unsub();
  }, [refreshStatus, clearEnrollTimeout]);

  // Record one sample: fixed duration, then stop automatically.
  const recordSample = useCallback(async () => {
    if (recording || enrollPending) return;
    setRecording(true);
    setRecordCountdown(SAMPLE_DURATION_MS / 1000);
    let tickInterval: ReturnType<typeof setInterval> | undefined;
    try {
      const ok = await audioService.startRecording(false);
      if (!ok) {
        ToastAndroid.show('Aufnahme konnte nicht gestartet werden', ToastAndroid.LONG);
        return;
      }
      // Countdown ticker (UI only).
      tickInterval = setInterval(() => {
        setRecordCountdown(c => Math.max(0, c - 1));
      }, 1000);
      // Auto-stop after the fixed sample duration.
      await new Promise<void>(resolve => setTimeout(resolve, SAMPLE_DURATION_MS));
      clearInterval(tickInterval);
      tickInterval = undefined;
      const result = await audioService.stopRecording();
      if (!result || !result.base64) {
        ToastAndroid.show('Aufnahme leer — nochmal probieren', ToastAndroid.LONG);
        return;
      }
      setSamples(prev => [...prev, { base64: result.base64, durationMs: result.durationMs }]);
    } catch (err: any) {
      console.warn('[VoiceId] recordSample:', err);
      // Best effort: the recorder may already be stopped, so a failing cancel is ignored.
      try { await audioService.cancelRecording(); } catch {}
      ToastAndroid.show('Aufnahmefehler: ' + (err?.message || err), ToastAndroid.LONG);
    } finally {
      // Single place that resets the recording UI for every exit path.
      if (tickInterval !== undefined) clearInterval(tickInterval);
      setRecordCountdown(0);
      setRecording(false);
    }
  }, [recording, enrollPending]);

  const removeSample = useCallback((idx: number) => {
    setSamples(prev => prev.filter((_, i) => i !== idx));
  }, []);

  // Submit all recorded samples for enrollment.
  const sendEnrollment = useCallback(() => {
    if (samples.length < SAMPLES_REQUIRED) {
      Alert.alert('Noch nicht genug',
        `Bitte mindestens ${SAMPLES_REQUIRED} Samples aufnehmen — aktuell ${samples.length}.`);
      return;
    }
    if (enrollPending) return;
    setEnrollPending(true);
    rvs.send('voice_id_enroll_request' as any, {
      requestId: _newReqId('videnroll'),
      samples: samples.map(sample => sample.base64),
    });
    // Safety timeout: release the UI if no response arrives within 60s. The timer
    // is cancelled as soon as a response is handled, so firing means "still pending".
    clearEnrollTimeout();
    enrollTimeoutRef.current = setTimeout(() => {
      enrollTimeoutRef.current = null;
      setEnrollPending(false);
      ToastAndroid.show('Enrollment-Timeout — bitte erneut versuchen', ToastAndroid.LONG);
    }, 60_000);
  }, [samples, enrollPending, clearEnrollTimeout]);

  // Ask for confirmation, then request deletion of the stored fingerprint.
  const deleteFingerprint = useCallback(() => {
    Alert.alert(
      'Stimme löschen?',
      'Danach muss ARIA neu enrolled werden, sonst greift Speaker-ID-Filter nicht.',
      [
        { text: 'Abbrechen', style: 'cancel' },
        {
          text: 'Löschen', style: 'destructive', onPress: () => {
            rvs.send('voice_id_delete_request' as any, { requestId: _newReqId('viddel') });
          },
        },
      ],
    );
  }, []);

  // ── Render ──────────────────────────────────────────────
  return (
    <ScrollView contentContainerStyle={{ paddingBottom: 30 }}>
      <Text style={s.intro}>
        ARIA erkennt deine Stimme an einem Fingerprint (SpeechBrain ECAPA-TDNN, 192 Dimensionen).
        Andere Sprecher (TV, Hintergrund, andere Personen) werden gefiltert — keine Brain-Calls,
        keine Tokens. {'\n\n'}
        Sprich {SAMPLES_REQUIRED} Mal je {SAMPLE_DURATION_MS / 1000}s ganz normal — verschiedene
        Sätze, ruhige Umgebung empfohlen.
      </Text>
      {/* Status card */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Status</Text>
        {status.state === 'loading' && (
          <View style={{ flexDirection: 'row', alignItems: 'center', gap: 8 }}>
            <ActivityIndicator color="#0096FF" />
            <Text style={s.statusText}>Wird abgefragt...</Text>
          </View>
        )}
        {status.state === 'unenrolled' && (
          <Text style={[s.statusText, { color: '#FFD60A' }]}>○ Nicht enrolled — Stimme einrichten ↓</Text>
        )}
        {status.state === 'enrolled' && (
          <>
            <Text style={[s.statusText, { color: '#34C759' }]}>
              ✓ Enrolled — {status.sampleCount} Samples
              ({status.durations.reduce((a, b) => a + b, 0).toFixed(1)}s gesamt)
            </Text>
            <Text style={s.statusSub}>
              Aktualisiert {new Date(status.updatedAt * 1000).toLocaleString('de-DE')} · dim={status.dim}
            </Text>
          </>
        )}
        {status.state === 'error' && (
          <Text style={[s.statusText, { color: '#FF6E6E' }]}>⚠ {status.message}</Text>
        )}
      </View>
      {/* Recording area */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Samples ({samples.length}/{SAMPLES_REQUIRED})</Text>
        {samples.length === 0 && !recording && (
          <Text style={s.hint}>Tipp: sprich klare normale Sätze, je 3-4 Sekunden Audio.</Text>
        )}
        {samples.map((sample, idx) => (
          <View key={idx} style={s.sampleRow}>
            <Text style={s.sampleText}>
              Sample {idx + 1} · {(sample.durationMs / 1000).toFixed(1)}s
            </Text>
            <TouchableOpacity onPress={() => removeSample(idx)} disabled={enrollPending}>
              <Text style={{ color: '#FF6E6E', fontSize: 18 }}>✕</Text>
            </TouchableOpacity>
          </View>
        ))}
        <TouchableOpacity
          onPress={recordSample}
          disabled={recording || enrollPending}
          style={[s.recordBtn, (recording || enrollPending) && { opacity: 0.5 }]}
        >
          {recording ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.recordBtnText}>Aufnahme läuft… {recordCountdown}s</Text>
            </>
          ) : (
            <Text style={s.recordBtnText}>⏺ Sample {samples.length + 1} aufnehmen</Text>
          )}
        </TouchableOpacity>
        {samples.length > 0 && !recording && (
          <TouchableOpacity
            onPress={() => setSamples([])}
            disabled={enrollPending}
            style={s.resetBtn}
          >
            <Text style={s.resetBtnText}>Alle verwerfen</Text>
          </TouchableOpacity>
        )}
      </View>
      {/* Actions */}
      <View style={{ flexDirection: 'row', gap: 8, marginTop: 8 }}>
        <TouchableOpacity
          onPress={sendEnrollment}
          disabled={samples.length < SAMPLES_REQUIRED || enrollPending}
          style={[
            s.primaryBtn,
            (samples.length < SAMPLES_REQUIRED || enrollPending) && { opacity: 0.4 },
          ]}
        >
          {enrollPending ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.primaryBtnText}>Wird verarbeitet…</Text>
            </>
          ) : (
            <Text style={s.primaryBtnText}>
              ✓ Speichern ({samples.length}/{SAMPLES_REQUIRED})
            </Text>
          )}
        </TouchableOpacity>
      </View>
      {/* Management (only once a fingerprint exists) */}
      {status.state === 'enrolled' && (
        <View style={[s.card, { marginTop: 20 }]}>
          <Text style={s.cardLabel}>Verwaltung</Text>
          <TouchableOpacity onPress={refreshStatus} style={s.secondaryBtn}>
            <Text style={s.secondaryBtnText}>🔄 Status aktualisieren</Text>
          </TouchableOpacity>
          <TouchableOpacity onPress={deleteFingerprint} style={s.dangerBtn}>
            <Text style={s.dangerBtnText}>🗑 Fingerprint löschen (Re-Enrollment nötig)</Text>
          </TouchableOpacity>
        </View>
      )}
    </ScrollView>
  );
 };
 // Layout shared by the two large action buttons (record / save).
 const bigButtonLayout = {
  flexDirection: 'row',
  alignItems: 'center',
  justifyContent: 'center',
  gap: 8,
  borderRadius: 8,
  paddingVertical: 14,
 } as const;

 // Label style shared by the two large action buttons.
 const bigButtonLabel = {
  color: '#fff',
  fontSize: 15,
  fontWeight: '700',
 } as const;

 // Layout shared by the small management buttons (refresh / delete).
 const smallButtonLayout = {
  borderRadius: 6,
  paddingVertical: 10,
  alignItems: 'center',
  marginTop: 6,
 } as const;

 // Label typography shared by the small management buttons.
 const smallButtonLabel = {
  fontSize: 13,
  fontWeight: '600',
 } as const;

 // Style sheet of the Voice-ID enrollment section.
 const s = StyleSheet.create({
  // Introductory paragraph above the cards.
  intro: {
    color: '#8888AA',
    fontSize: 13,
    lineHeight: 19,
    marginBottom: 16,
    paddingHorizontal: 4,
  },

  // Card container and its heading.
  card: {
    backgroundColor: 'rgba(30,30,46,0.6)',
    borderRadius: 8,
    padding: 14,
    marginBottom: 10,
  },
  cardLabel: {
    color: '#8888AA',
    fontSize: 11,
    fontWeight: '700',
    textTransform: 'uppercase',
    letterSpacing: 0.5,
    marginBottom: 8,
  },

  // Status card texts.
  statusText: {
    color: '#E0E0F0',
    fontSize: 14,
    fontWeight: '600',
  },
  statusSub: {
    color: '#555570',
    fontSize: 11,
    marginTop: 4,
  },

  // Sample list.
  hint: {
    color: '#555570',
    fontSize: 12,
    fontStyle: 'italic',
    marginBottom: 8,
  },
  sampleRow: {
    flexDirection: 'row',
    justifyContent: 'space-between',
    alignItems: 'center',
    paddingVertical: 6,
    borderBottomWidth: 1,
    borderColor: '#2A2A3E',
  },
  sampleText: {
    color: '#E0E0F0',
    fontSize: 13,
  },

  // Large action buttons.
  recordBtn: {
    ...bigButtonLayout,
    backgroundColor: '#E55C5C',
    marginTop: 12,
  },
  recordBtnText: { ...bigButtonLabel },
  primaryBtn: {
    ...bigButtonLayout,
    flex: 1,
    backgroundColor: '#34C759',
  },
  primaryBtnText: { ...bigButtonLabel },

  // "Discard all" link below the record button.
  resetBtn: {
    alignItems: 'center',
    paddingVertical: 8,
    marginTop: 6,
  },
  resetBtnText: {
    color: '#FFD60A',
    fontSize: 12,
  },

  // Small management buttons.
  secondaryBtn: {
    ...smallButtonLayout,
    backgroundColor: 'rgba(0,150,255,0.15)',
  },
  secondaryBtnText: {
    ...smallButtonLabel,
    color: '#0096FF',
  },
  dangerBtn: {
    ...smallButtonLayout,
    backgroundColor: 'rgba(229,92,92,0.15)',
  },
  dangerBtnText: {
    ...smallButtonLabel,
    color: '#E55C5C',
  },
 });
 export default VoiceIdEnrollment;
@@ -91,6 +91,7 @@ import MemoryBrowser from '../components/MemoryBrowser';
 import TriggerBrowser from '../components/TriggerBrowser';
 import SkillBrowser from '../components/SkillBrowser';
 import OAuthBrowser from '../components/OAuthBrowser';
 import VoiceIdEnrollment from '../components/VoiceIdEnrollment';
 import { isVerboseLogging, setVerboseLogging, isDebugLogsToBridge, setDebugLogsToBridge, APP_LOG_EVENT } from '../services/logger';
 import {
  isWakeReadySoundEnabled,
@@ -136,6 +137,7 @@ const SETTINGS_SECTIONS = [
  { id: 'general',      icon: '⚙️', label: 'Allgemein',      desc: 'Betriebsmodus, GPS-Standort' },
  { id: 'voice_input',  icon: '🎙️', label: 'Spracheingabe',  desc: 'Stille-Toleranz, Aufnahmedauer' },
  { id: 'wake_word',    icon: '👂', label: 'Wake-Word',      desc: 'Wake-Word-Auswahl' },
  { id: 'voice_id',     icon: '🎤', label: 'Stimme einrichten', desc: 'Sprecher-Erkennung — nur deine Stimme triggert ARIA' },
  { id: 'voice_output', icon: '🔊', label: 'Sprachausgabe',  desc: 'Stimmen, Pre-Roll, Geschwindigkeit' },
  { id: 'storage',      icon: '📁', label: 'Speicher',       desc: 'Anhang-Speicherort, Auto-Download' },
  { id: 'files',        icon: '📂', label: 'Dateien',        desc: 'ARIA- und User-Dateien — anzeigen, löschen' },
@@ -1836,6 +1838,12 @@ const SettingsScreen: React.FC = () => {
      </View>
      </>)}
      {/* === Voice-ID Enrollment (Sprecher-Erkennung) === */}
      {currentSection === 'voice_id' && (<>
      <Text style={styles.sectionTitle}>Stimme einrichten</Text>
      <VoiceIdEnrollment />
      </>)}
      {/* === Sprachausgabe (geraetelokal) === */}
      {currentSection === 'voice_output' && (<>
      <Text style={styles.sectionTitle}>Sprachausgabe</Text>
@@ -764,6 +764,42 @@
      </div>
    </div>
    <!-- Voice-ID (speaker recognition): status line, match-threshold slider,
         refresh and delete buttons. The status line is filled by the
         voice_id_status_response handler; the slider value is sent via
         sendVoiceConfig() when the user releases it. -->
    <div class="settings-section">
      <h2>Voice-ID (Sprecher-Erkennung)</h2>
      <div style="font-size:11px;color:#8888AA;margin-bottom:8px;">
        ARIA erkennt Stefans Stimme anhand eines Fingerprints (SpeechBrain ECAPA-TDNN).
        Andere Sprecher (TV, Hintergrund-Gespraeche) werden gefiltert — keine
        Brain-Calls, keine Tokens. Enrollment passiert in der App (Settings → Stimme einrichten),
        weil das Handy-Mikro auch im Betrieb hoert.
      </div>
      <div class="card" style="max-width:500px;">
        <div id="voice-id-status" style="font-size:13px;color:#E0E0F0;margin-bottom:10px;">
          Status wird geladen...
        </div>
        <div style="display:flex;align-items:center;gap:12px;margin-bottom:8px;">
          <label style="color:#8888AA;font-size:12px;min-width:130px;">Match-Threshold:</label>
          <!-- The live display is formatted to two decimals so it matches the
               initial "0.50" and the value restored from the saved config. -->
          <input type="range" id="diag-voice-id-threshold" min="0.30" max="0.70" step="0.05" value="0.50"
                 oninput="document.getElementById('voice-id-threshold-display').textContent = Number(this.value).toFixed(2)"
                 onchange="sendVoiceConfig()"
                 style="flex:1;">
          <span id="voice-id-threshold-display" style="color:#E0E0F0;font-family:monospace;min-width:40px;text-align:right;">0.50</span>
        </div>
        <div style="font-size:10px;color:#555570;margin-bottom:12px;">
          Niedriger = mehr Treffer auch bei Nebengeraeuschen (false-positives).
          Hoeher = strenger, kann Stefan auch mal verpassen. 0.50 ist konservativer Default.
        </div>
        <div style="display:flex;gap:8px;">
          <button class="btn secondary" onclick="refreshVoiceIdStatus()" style="padding:6px 14px;font-size:12px;">
            🔄 Status aktualisieren
          </button>
          <button class="btn danger" onclick="deleteVoiceId()" style="padding:6px 14px;font-size:12px;">
            🗑 Fingerprint löschen
          </button>
        </div>
      </div>
    </div>
    <!-- Runtime-Konfiguration -->
    <div class="settings-section">
      <h2>Runtime-Konfiguration</h2>
@@ -1475,6 +1511,46 @@
          setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw);
          setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch);
          setIfPresent('diag-flux-hf-token', msg.huggingfaceToken);
          // Voice-ID-Threshold wiederherstellen (Default 0.50)
          if (msg.voiceIdThreshold !== undefined && msg.voiceIdThreshold !== null) {
            const slider = document.getElementById('diag-voice-id-threshold');
            const display = document.getElementById('voice-id-threshold-display');
            if (slider) slider.value = msg.voiceIdThreshold;
            if (display) display.textContent = Number(msg.voiceIdThreshold).toFixed(2);
          }
          return;
        }
        if (msg.type === 'voice_id_status_response') {
          const el = document.getElementById('voice-id-status');
          if (!el) return;
          if (msg.payload && msg.payload.ok === false) {
            el.innerHTML = '<span style="color:#FF6E6E;">⚠ Whisper-Bridge nicht erreichbar: ' +
                           (msg.payload.error || 'unbekannt') + '</span>';
            return;
          }
          const p = msg.payload || msg;
          if (p.enrolled) {
            const when = p.updated_at ? new Date(p.updated_at * 1000).toLocaleString('de-DE') : '?';
            const totalSec = (p.sample_durations_s || []).reduce((a, b) => a + b, 0);
            el.innerHTML = '<span style="color:#34C759;">✓ Enrolled</span> · ' +
                           p.sample_count + ' Samples (' + totalSec.toFixed(1) + 's) · ' +
                           'aktualisiert ' + when + ' · dim=' + (p.embedding_dim || '?');
          } else {
            el.innerHTML = '<span style="color:#FFD60A;">○ Nicht enrolled</span> — ' +
                           'in der App unter "Stimme einrichten" 5-10× je 3s aufnehmen.';
          }
          return;
        }
        if (msg.type === 'voice_id_delete_response') {
          const p = msg.payload || msg;
          if (p.removed) {
            alert('Fingerprint gelöscht — Voice-ID-Gating fällt zurück auf Fail-Open.');
          } else {
            alert('Es war kein Fingerprint vorhanden.');
          }
          refreshVoiceIdStatus();
          return;
        }
@@ -2607,6 +2683,17 @@
      });
    }
    /**
     * Shows a "querying" placeholder in the Voice-ID status line (if that
     * element exists) and sends the `voice_id_status` action.
     */
    function refreshVoiceIdStatus() {
      const statusNode = document.getElementById('voice-id-status');
      if (statusNode !== null) {
        statusNode.textContent = '⏳ Status wird abgefragt...';
      }
      send({ action: 'voice_id_status' });
    }
    /**
     * Asks for confirmation and, if the user agrees, sends the
     * `voice_id_delete` action. Does nothing when the dialog is cancelled.
     */
    function deleteVoiceId() {
      const confirmed = confirm('Voice-ID-Fingerprint loeschen?\n\nDanach muss in der App neu enrolled werden.');
      if (confirmed) {
        send({ action: 'voice_id_delete' });
      }
    }
    function deleteXttsVoice(name) {
      if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
      send({ action: 'xtts_delete_voice', name });
@@ -2823,12 +2910,15 @@
      const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value;
      const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value;
      const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value;
      const voiceIdThresholdRaw = document.getElementById('diag-voice-id-threshold')?.value;
      const voiceIdThreshold = voiceIdThresholdRaw ? parseFloat(voiceIdThresholdRaw) : undefined;
      send({
        action: 'send_voice_config',
        ttsEnabled, xttsVoice, whisperModel,
        f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
        f5ttsCfgStrength, f5ttsNfeStep,
        fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken,
        voiceIdThreshold,
      });
      const statusEl = document.getElementById('voice-status');
      if (statusEl && xttsVoice) {
@@ -3354,6 +3444,7 @@
        loadRuntimeConfig();
        loadOnboardingQR();
        loadOAuthServices();
        refreshVoiceIdStatus();
      } else if (tab === 'brain') {
        loadBrainStatus();
        loadBrainMemoryList();
@@ -2367,6 +2367,12 @@ wss.on("connection", (ws) => {
        if (msg.huggingfaceToken !== undefined) {
          voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim();
        }
        // Voice-ID Match-Threshold (0.30-0.70). Wird von der whisper-bridge
        // ueber den config-Broadcast aufgenommen — Phase 3 nutzt's beim Gating.
        if (msg.voiceIdThreshold !== undefined && !isNaN(msg.voiceIdThreshold)) {
          const t = parseFloat(msg.voiceIdThreshold);
          if (t >= 0.0 && t <= 1.0) voiceConfig.voiceIdThreshold = t;
        }
        try {
          fs.mkdirSync("/shared/config", { recursive: true });
          fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
@@ -2390,6 +2396,15 @@ wss.on("connection", (ws) => {
        handleGetModel(ws);
      } else if (msg.action === "set_model") {
        handleSetModel(ws, msg.model);
      } else if (msg.action === "voice_id_status") {
        // An whisper-bridge weiterleiten + Antwort an Browser zurueck
        const reqId = `vid_${Date.now().toString(36)}`;
        sendToRVS_withResponse("voice_id_status_request", { requestId: reqId },
                               "voice_id_status_response", ws);
      } else if (msg.action === "voice_id_delete") {
        const reqId = `viddel_${Date.now().toString(36)}`;
        sendToRVS_withResponse("voice_id_delete_request", { requestId: reqId },
                               "voice_id_delete_response", ws);
      }
      // get_openclaw_config entfernt — aria-core ist raus.
    } catch {}
@@ -781,6 +781,17 @@ async def run_loop(runner: WhisperRunner, sessions: SessionManager) -> None:
                        # Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog
                        # damit Stefan im laufenden Betrieb via Diagnostic-Settings
                        # die Logs an/aus schalten kann.
                        # Voice-ID match threshold (sent by the Diagnostic page): stored on
                        # the speaker_id module attribute DEFAULT_THRESHOLD. Per the original
                        # note it is only used for gating from Phase 3 on. Values that are
                        # not numeric or lie outside 0.0-1.0 are ignored, but logged so a bad
                        # config does not go unnoticed.
                        if "voiceIdThreshold" in payload:
                            raw_threshold = payload["voiceIdThreshold"]
                            try:
                                t = float(raw_threshold)
                            except (TypeError, ValueError):
                                logger.warning("[speaker-id] threshold ignoriert (keine Zahl): %r",
                                               raw_threshold)
                            else:
                                # NaN fails both comparisons and therefore lands in the else branch.
                                if 0.0 <= t <= 1.0:
                                    speaker_id.DEFAULT_THRESHOLD = t
                                    logger.info("[speaker-id] threshold gesetzt: %.2f", t)
                                else:
                                    logger.warning("[speaker-id] threshold ignoriert (ausserhalb 0.0-1.0): %r",
                                                   raw_threshold)
                        if "whisperDebugLog" in payload:
                            global _DEBUG_LOG_TO_BRIDGE
                            old = _DEBUG_LOG_TO_BRIDGE
@@ -61,10 +61,35 @@ def _ensure_loaded():
    return _model
 def _normalize_audio_bytes(audio_bytes: bytes) -> bytes:
    """Akzeptiert entweder rohes 16kHz int16 LE PCM ODER eine WAV-Datei (RIFF/WAVE).
    Bei WAV wird der Header gestrippt + Format validiert (16kHz / mono / int16).
    Ergebnis: rohes PCM."""
    if (len(audio_bytes) >= 44
        and audio_bytes[:4] == b"RIFF"
        and audio_bytes[8:12] == b"WAVE"):
        import io
        import wave
        with wave.open(io.BytesIO(audio_bytes), "rb") as wav:
            sr = wav.getframerate()
            ch = wav.getnchannels()
            sw = wav.getsampwidth()
            if sr != 16000:
                raise ValueError(f"WAV-Samplerate {sr} != 16000")
            if ch != 1:
                raise ValueError(f"WAV-Kanalzahl {ch} != 1 (mono erwartet)")
            if sw != 2:
                raise ValueError(f"WAV-Sampleweite {sw} != 2 (int16 erwartet)")
            return wav.readframes(wav.getnframes())
    return audio_bytes
 def _audio_bytes_to_tensor(audio_bytes: bytes):
-    """int16 LE PCM (16kHz mono) → Torch-Tensor (1, N), normalisiert auf [-1, 1]."""
+    """int16 LE PCM (16kHz mono) → Torch-Tensor (1, N), normalisiert auf [-1, 1].
    WAV wird vorher auf rohes PCM reduziert (Header strippen)."""
    import torch
-    arr = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
+    raw = _normalize_audio_bytes(audio_bytes)
    arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
    return torch.from_numpy(arr).unsqueeze(0)