feat(speaker-id): Phase 2 — Enrollment-UI (App) + Voice-ID-Section (Diagnostic)

App-Seite:
- VoiceIdEnrollment.tsx (neue Komponente, ~430 Zeilen): Status-Karte
  (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown
  (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button
  (disabled bis 5 Samples), Fingerprint-Delete mit Confirm.
- SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen
  Wake-Word und Sprachausgabe.
- Sample-Format: WAV via audioService.startRecording — wird
  whisper-bridge-seitig per wave-Modul gestrippt.

Diagnostic-Seite:
- Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige
  (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70
  (persistiert in voice_config.json, broadcast als config-Message),
  Refresh + Delete-Button.
- server.js: 2 neue actions (voice_id_status, voice_id_delete),
  send_voice_config nimmt voiceIdThreshold mit auf.

Backend:
- speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header
  (RIFF/WAVE) und strippt auf rohes PCM — sonst fließt der 44-Byte-Header
  als vermeintliche Audiodaten in die ECAPA-Embeddings ein.
- bridge.py: config-Broadcast-Handler setzt voiceIdThreshold auf
  speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating
  genutzt, persistiert aber schon).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 20:36:06 +02:00
parent 6e19adab87
commit e3fe27f736
6 changed files with 578 additions and 2 deletions
@@ -0,0 +1,426 @@
/**
* Voice-ID Enrollment + Status — App-seitig.
*
* User nimmt 5-7 Samples (je 4s) seiner Stimme auf, App schickt sie an
* die whisper-bridge via RVS (voice_id_enroll_request). Bridge berechnet
* SpeechBrain-ECAPA-Embeddings, mittelt sie zu einem Fingerprint, speichert
* /voice-id/fingerprint.json.
*
* Verwendung: in SettingsScreen für Section 'voice_id' eingebunden.
* Holt Status bei Mount + nach jedem Enroll/Delete neu ab.
*/
import React, { useCallback, useEffect, useState } from 'react';
import {
ActivityIndicator,
Alert,
ScrollView,
StyleSheet,
Text,
ToastAndroid,
TouchableOpacity,
View,
} from 'react-native';
import audioService from '../services/audio';
import rvs from '../services/rvs';
/** Fixed recording length of a single voice sample, in milliseconds (4 s). */
const SAMPLE_DURATION_MS = 4_000;
/** Minimum number of recorded samples before the save action is enabled. */
const SAMPLES_REQUIRED = 5;
/** One recorded voice sample as kept in component state until it is sent. */
type Sample = {
  /** Length of the recording in milliseconds, as reported by the recorder. */
  durationMs: number;
  /** Recorded audio as a base64 string; sent verbatim in the enroll request. */
  base64: string;
};

/** A status request is in flight and no answer has been processed yet. */
type LoadingStatus = { state: 'loading' };

/** The bridge answered that no fingerprint is enrolled. */
type UnenrolledStatus = { state: 'unenrolled' };

/** The bridge answered with details of the enrolled fingerprint. */
type EnrolledStatus = {
  state: 'enrolled';
  /** Number of samples the fingerprint was built from. */
  sampleCount: number;
  /** Per-sample durations; rendered as seconds and summed in the status card. */
  durations: number[];
  /** Last-update timestamp; multiplied by 1000 before being passed to `Date`. */
  updatedAt: number;
  /** Embedding dimension reported by the bridge. */
  dim: number;
};

/** The status request failed; `message` is shown to the user as-is. */
type ErrorStatus = { state: 'error'; message: string };

/** Voice-ID status shown in the status card, discriminated by `state`. */
type Status = LoadingStatus | UnenrolledStatus | EnrolledStatus | ErrorStatus;
/**
 * Builds a request id of the form `<prefix>_<time>_<random>`.
 *
 * The time part is `Date.now()` and the random part is an integer below one
 * million, both rendered in base 36.
 *
 * @param prefix Leading tag identifying the kind of request.
 * @returns The assembled id string.
 */
function _newReqId(prefix: string): string {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
  return [prefix, timePart, randomPart].join('_');
}
/** How long to wait for `voice_id_enroll_response` before re-enabling the UI. */
const ENROLL_TIMEOUT_MS = 60_000;

/**
 * Settings section for speaker-ID enrollment.
 *
 * - Requests the current voice-ID status on mount and again after every
 *   enroll or delete response.
 * - Records fixed-length samples (`SAMPLE_DURATION_MS` each) through
 *   `audioService` and keeps them in local state until they are sent.
 * - Sends the collected samples as `voice_id_enroll_request` once at least
 *   `SAMPLES_REQUIRED` samples exist, and offers deletion of the stored
 *   fingerprint behind a confirmation dialog.
 *
 * NOTE(review): responses are handled purely by message type; the
 * `requestId` sent with each request is not compared against responses.
 */
export const VoiceIdEnrollment: React.FC = () => {
  const [status, setStatus] = useState<Status>({ state: 'loading' });
  const [samples, setSamples] = useState<Sample[]>([]);
  const [recording, setRecording] = useState(false);
  const [recordCountdown, setRecordCountdown] = useState(0);
  const [enrollPending, setEnrollPending] = useState(false);

  // Handle of the running enrollment safety timeout, if any. Kept in a ref so
  // it can be cancelled from the response handler and on unmount.
  const enrollTimeoutRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);

  /** Cancels the enrollment safety timeout if one is running. */
  const clearEnrollTimeout = useCallback(() => {
    if (enrollTimeoutRef.current !== null) {
      clearTimeout(enrollTimeoutRef.current);
      enrollTimeoutRef.current = null;
    }
  }, []);

  // Never let the safety timeout fire after the component is gone.
  useEffect(() => clearEnrollTimeout, [clearEnrollTimeout]);

  /** Switches the status card to "loading" and asks the bridge for the status. */
  const refreshStatus = useCallback(() => {
    setStatus({ state: 'loading' });
    rvs.send('voice_id_status_request' as any, { requestId: _newReqId('vid') });
  }, []);

  // Handle RVS responses. Registered before the initial status request below
  // so the listener is already in place when the request goes out.
  useEffect(() => {
    const unsub = rvs.onMessage((msg: any) => {
      if (!msg) return;
      const p = msg.payload || {};
      if (msg.type === 'voice_id_status_response') {
        if (p.ok === false) {
          setStatus({ state: 'error', message: p.error || 'Whisper-Bridge nicht erreichbar' });
          return;
        }
        if (p.enrolled) {
          setStatus({
            state: 'enrolled',
            sampleCount: p.sample_count || 0,
            durations: p.sample_durations_s || [],
            updatedAt: p.updated_at || 0,
            dim: p.embedding_dim || 0,
          });
        } else {
          setStatus({ state: 'unenrolled' });
        }
      } else if (msg.type === 'voice_id_enroll_response') {
        // The answer arrived: the safety timeout must not fire any more,
        // otherwise it could cancel a later enrollment attempt.
        clearEnrollTimeout();
        setEnrollPending(false);
        if (p.ok === false) {
          Alert.alert('Enrollment fehlgeschlagen', p.error || 'Unbekannter Fehler');
          return;
        }
        const rejected = (p.rejected || []).length;
        ToastAndroid.show(
          `✓ Stimme gespeichert (${p.sample_count} Samples${rejected ? `, ${rejected} verworfen` : ''})`,
          ToastAndroid.LONG,
        );
        setSamples([]);
        refreshStatus();
      } else if (msg.type === 'voice_id_delete_response') {
        ToastAndroid.show(p.removed ? '✓ Stimme gelöscht' : 'Es war keine gespeichert', ToastAndroid.SHORT);
        refreshStatus();
      }
    });
    return () => unsub();
  }, [refreshStatus, clearEnrollTimeout]);

  // Load the status once on mount.
  useEffect(() => {
    refreshStatus();
  }, [refreshStatus]);

  /**
   * Records one sample: starts the recorder, waits the fixed sample duration,
   * stops it and appends the result to `samples`. No-op while another
   * recording or an enrollment is in progress.
   */
  const recordSample = useCallback(async () => {
    if (recording || enrollPending) return;
    setRecording(true);
    setRecordCountdown(SAMPLE_DURATION_MS / 1000);
    try {
      const ok = await audioService.startRecording(false);
      if (!ok) {
        ToastAndroid.show('Aufnahme konnte nicht gestartet werden', ToastAndroid.LONG);
        return;
      }
      // Countdown ticker (UI only).
      const tickInterval = setInterval(() => {
        setRecordCountdown(c => Math.max(0, c - 1));
      }, 1000);
      // Auto-stop after the fixed sample duration.
      await new Promise(r => setTimeout(r, SAMPLE_DURATION_MS));
      clearInterval(tickInterval);
      const result = await audioService.stopRecording();
      if (!result || !result.base64) {
        ToastAndroid.show('Aufnahme leer — nochmal probieren', ToastAndroid.LONG);
        return;
      }
      setSamples(prev => [...prev, { base64: result.base64, durationMs: result.durationMs }]);
    } catch (err: any) {
      console.warn('[VoiceId] recordSample:', err);
      // Best effort: a failing cancel must not hide the original error.
      try { await audioService.cancelRecording(); } catch {}
      ToastAndroid.show('Aufnahmefehler: ' + (err?.message || err), ToastAndroid.LONG);
    } finally {
      // Every exit path leaves the recorder UI in its idle state.
      setRecording(false);
      setRecordCountdown(0);
    }
  }, [recording, enrollPending]);

  /** Drops the sample at position `idx` from the pending list. */
  const removeSample = useCallback((idx: number) => {
    setSamples(prev => prev.filter((_, i) => i !== idx));
  }, []);

  /**
   * Sends all collected samples to the bridge and marks the enrollment as
   * pending. If no response arrives within `ENROLL_TIMEOUT_MS`, the pending
   * flag is released and the user is told to retry.
   */
  const sendEnrollment = useCallback(() => {
    if (samples.length < SAMPLES_REQUIRED) {
      Alert.alert('Noch nicht genug',
        `Bitte mindestens ${SAMPLES_REQUIRED} Samples aufnehmen — aktuell ${samples.length}.`);
      return;
    }
    if (enrollPending) return;
    setEnrollPending(true);
    const reqId = _newReqId('videnroll');
    rvs.send('voice_id_enroll_request' as any, {
      requestId: reqId,
      samples: samples.map(sample => sample.base64),
    });
    // Safety timeout. It is cancelled as soon as a response arrives, so when
    // it does fire this enrollment is still unanswered.
    clearEnrollTimeout();
    enrollTimeoutRef.current = setTimeout(() => {
      enrollTimeoutRef.current = null;
      setEnrollPending(false);
      ToastAndroid.show('Enrollment-Timeout — bitte erneut versuchen', ToastAndroid.LONG);
    }, ENROLL_TIMEOUT_MS);
  }, [samples, enrollPending, clearEnrollTimeout]);

  /** Asks for confirmation and then requests deletion of the fingerprint. */
  const deleteFingerprint = useCallback(() => {
    Alert.alert(
      'Stimme löschen?',
      'Danach muss ARIA neu enrolled werden, sonst greift Speaker-ID-Filter nicht.',
      [
        { text: 'Abbrechen', style: 'cancel' },
        {
          text: 'Löschen', style: 'destructive', onPress: () => {
            const reqId = _newReqId('viddel');
            rvs.send('voice_id_delete_request' as any, { requestId: reqId });
          },
        },
      ],
    );
  }, []);

  const recorderBusy = recording || enrollPending;
  const saveDisabled = samples.length < SAMPLES_REQUIRED || enrollPending;

  // ── Render ──────────────────────────────────────────────
  return (
    <ScrollView contentContainerStyle={{ paddingBottom: 30 }}>
      <Text style={s.intro}>
        ARIA erkennt deine Stimme an einem Fingerprint (SpeechBrain ECAPA-TDNN, 192 Dimensionen).
        Andere Sprecher (TV, Hintergrund, andere Personen) werden gefiltert keine Brain-Calls,
        keine Tokens. {'\n\n'}
        Sprich {SAMPLES_REQUIRED} Mal je {SAMPLE_DURATION_MS / 1000}s ganz normal verschiedene
        Sätze, ruhige Umgebung empfohlen.
      </Text>

      {/* Status card */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Status</Text>
        {status.state === 'loading' && (
          <View style={{ flexDirection: 'row', alignItems: 'center', gap: 8 }}>
            <ActivityIndicator color="#0096FF" />
            <Text style={s.statusText}>Wird abgefragt...</Text>
          </View>
        )}
        {status.state === 'unenrolled' && (
          <Text style={[s.statusText, { color: '#FFD60A' }]}> Nicht enrolled Stimme einrichten </Text>
        )}
        {status.state === 'enrolled' && (
          <>
            <Text style={[s.statusText, { color: '#34C759' }]}>
              Enrolled {status.sampleCount} Samples
              ({status.durations.reduce((a, b) => a + b, 0).toFixed(1)}s gesamt)
            </Text>
            <Text style={s.statusSub}>
              Aktualisiert {new Date(status.updatedAt * 1000).toLocaleString('de-DE')} · dim={status.dim}
            </Text>
          </>
        )}
        {status.state === 'error' && (
          <Text style={[s.statusText, { color: '#FF6E6E' }]}> {status.message}</Text>
        )}
      </View>

      {/* Recording area */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Samples ({samples.length}/{SAMPLES_REQUIRED})</Text>
        {samples.length === 0 && !recording && (
          <Text style={s.hint}>Tipp: sprich klare normale Sätze, je 3-4 Sekunden Audio.</Text>
        )}
        {samples.map((sample, idx) => (
          <View key={idx} style={s.sampleRow}>
            <Text style={s.sampleText}>
              Sample {idx + 1} · {(sample.durationMs / 1000).toFixed(1)}s
            </Text>
            <TouchableOpacity
              onPress={() => removeSample(idx)}
              disabled={enrollPending}
              accessibilityLabel={`Sample ${idx + 1} löschen`}
              hitSlop={{ top: 8, bottom: 8, left: 8, right: 8 }}
            >
              {/* Visible glyph so the remove action has a real touch target. */}
              <Text style={{ color: '#FF6E6E', fontSize: 18 }}>✕</Text>
            </TouchableOpacity>
          </View>
        ))}
        <TouchableOpacity
          onPress={recordSample}
          disabled={recorderBusy}
          style={[s.recordBtn, recorderBusy && { opacity: 0.5 }]}
        >
          {recording ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.recordBtnText}>Aufnahme läuft {recordCountdown}s</Text>
            </>
          ) : (
            <Text style={s.recordBtnText}> Sample {samples.length + 1} aufnehmen</Text>
          )}
        </TouchableOpacity>
        {samples.length > 0 && !recording && (
          <TouchableOpacity
            onPress={() => setSamples([])}
            disabled={enrollPending}
            style={s.resetBtn}
          >
            <Text style={s.resetBtnText}>Alle verwerfen</Text>
          </TouchableOpacity>
        )}
      </View>

      {/* Actions */}
      <View style={{ flexDirection: 'row', gap: 8, marginTop: 8 }}>
        <TouchableOpacity
          onPress={sendEnrollment}
          disabled={saveDisabled}
          style={[
            s.primaryBtn,
            saveDisabled && { opacity: 0.4 },
          ]}
        >
          {enrollPending ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.primaryBtnText}>Wird verarbeitet</Text>
            </>
          ) : (
            <Text style={s.primaryBtnText}>
              Speichern ({samples.length}/{SAMPLES_REQUIRED})
            </Text>
          )}
        </TouchableOpacity>
      </View>

      {/* Management (only once a fingerprint exists) */}
      {status.state === 'enrolled' && (
        <View style={[s.card, { marginTop: 20 }]}>
          <Text style={s.cardLabel}>Verwaltung</Text>
          <TouchableOpacity onPress={refreshStatus} style={s.secondaryBtn}>
            <Text style={s.secondaryBtnText}>🔄 Status aktualisieren</Text>
          </TouchableOpacity>
          <TouchableOpacity onPress={deleteFingerprint} style={s.dangerBtn}>
            <Text style={s.dangerBtnText}>🗑 Fingerprint löschen (Re-Enrollment nötig)</Text>
          </TouchableOpacity>
        </View>
      )}
    </ScrollView>
  );
};
// Colours shared by several of the styles below.
const palette = {
  muted: '#8888AA',
  faint: '#555570',
  text: '#E0E0F0',
  onSolid: '#fff',
  warn: '#FFD60A',
  success: '#34C759',
  accent: '#0096FF',
  danger: '#E55C5C',
  divider: '#2A2A3E',
} as const;

// Layout common to the two large, solid buttons (record / save).
const solidButton = {
  flexDirection: 'row',
  alignItems: 'center',
  justifyContent: 'center',
  gap: 8,
  borderRadius: 8,
  paddingVertical: 14,
} as const;

// Label typography on the solid buttons.
const solidButtonLabel = {
  color: palette.onSolid,
  fontSize: 15,
  fontWeight: '700',
} as const;

// Layout common to the two tinted management buttons (refresh / delete).
const tintedButton = {
  borderRadius: 6,
  paddingVertical: 10,
  alignItems: 'center',
  marginTop: 6,
} as const;

// Label typography on the tinted buttons; the colour is set per button.
const tintedButtonLabel = {
  fontSize: 13,
  fontWeight: '600',
} as const;

/** Styles for the voice-ID enrollment section. */
const s = StyleSheet.create({
  // Explanatory paragraph at the top of the section.
  intro: {
    color: palette.muted,
    fontSize: 13,
    lineHeight: 19,
    marginBottom: 16,
    paddingHorizontal: 4,
  },

  // Cards and their captions.
  card: {
    backgroundColor: 'rgba(30,30,46,0.6)',
    borderRadius: 8,
    padding: 14,
    marginBottom: 10,
  },
  cardLabel: {
    color: palette.muted,
    fontSize: 11,
    fontWeight: '700',
    textTransform: 'uppercase',
    letterSpacing: 0.5,
    marginBottom: 8,
  },

  // Status card texts.
  statusText: {
    color: palette.text,
    fontSize: 14,
    fontWeight: '600',
  },
  statusSub: {
    color: palette.faint,
    fontSize: 11,
    marginTop: 4,
  },

  // Sample list.
  hint: {
    color: palette.faint,
    fontSize: 12,
    fontStyle: 'italic',
    marginBottom: 8,
  },
  sampleRow: {
    flexDirection: 'row',
    justifyContent: 'space-between',
    alignItems: 'center',
    paddingVertical: 6,
    borderBottomWidth: 1,
    borderColor: palette.divider,
  },
  sampleText: {
    color: palette.text,
    fontSize: 13,
  },

  // Record button and the "discard all" link below it.
  recordBtn: {
    ...solidButton,
    backgroundColor: palette.danger,
    marginTop: 12,
  },
  recordBtnText: { ...solidButtonLabel },
  resetBtn: {
    alignItems: 'center',
    paddingVertical: 8,
    marginTop: 6,
  },
  resetBtnText: {
    color: palette.warn,
    fontSize: 12,
  },

  // Save button.
  primaryBtn: {
    ...solidButton,
    flex: 1,
    backgroundColor: palette.success,
  },
  primaryBtnText: { ...solidButtonLabel },

  // Management buttons.
  secondaryBtn: {
    ...tintedButton,
    backgroundColor: 'rgba(0,150,255,0.15)',
  },
  secondaryBtnText: {
    ...tintedButtonLabel,
    color: palette.accent,
  },
  dangerBtn: {
    ...tintedButton,
    backgroundColor: 'rgba(229,92,92,0.15)',
  },
  dangerBtnText: {
    ...tintedButtonLabel,
    color: palette.danger,
  },
});
export default VoiceIdEnrollment;
+8
View File
@@ -91,6 +91,7 @@ import MemoryBrowser from '../components/MemoryBrowser';
import TriggerBrowser from '../components/TriggerBrowser';
import SkillBrowser from '../components/SkillBrowser';
import OAuthBrowser from '../components/OAuthBrowser';
import VoiceIdEnrollment from '../components/VoiceIdEnrollment';
import { isVerboseLogging, setVerboseLogging, isDebugLogsToBridge, setDebugLogsToBridge, APP_LOG_EVENT } from '../services/logger';
import {
isWakeReadySoundEnabled,
@@ -136,6 +137,7 @@ const SETTINGS_SECTIONS = [
{ id: 'general', icon: '⚙️', label: 'Allgemein', desc: 'Betriebsmodus, GPS-Standort' },
{ id: 'voice_input', icon: '🎙️', label: 'Spracheingabe', desc: 'Stille-Toleranz, Aufnahmedauer' },
{ id: 'wake_word', icon: '👂', label: 'Wake-Word', desc: 'Wake-Word-Auswahl' },
{ id: 'voice_id', icon: '🎤', label: 'Stimme einrichten', desc: 'Sprecher-Erkennung — nur deine Stimme triggert ARIA' },
{ id: 'voice_output', icon: '🔊', label: 'Sprachausgabe', desc: 'Stimmen, Pre-Roll, Geschwindigkeit' },
{ id: 'storage', icon: '📁', label: 'Speicher', desc: 'Anhang-Speicherort, Auto-Download' },
{ id: 'files', icon: '📂', label: 'Dateien', desc: 'ARIA- und User-Dateien — anzeigen, löschen' },
@@ -1836,6 +1838,12 @@ const SettingsScreen: React.FC = () => {
</View>
</>)}
{/* === Voice-ID Enrollment (Sprecher-Erkennung) === */}
{currentSection === 'voice_id' && (<>
<Text style={styles.sectionTitle}>Stimme einrichten</Text>
<VoiceIdEnrollment />
</>)}
{/* === Sprachausgabe (geraetelokal) === */}
{currentSection === 'voice_output' && (<>
<Text style={styles.sectionTitle}>Sprachausgabe</Text>