feat(speaker-id): Phase 2 — Enrollment-UI (App) + Voice-ID-Section (Diagnostic)
App-Seite: - VoiceIdEnrollment.tsx (neue Komponente, ~370 Zeilen): Status-Karte (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button (disabled bis 5 Samples), Fingerprint-Delete mit Confirm. - SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen Wake-Word und Sprachausgabe. - Sample-Format: WAV via audioService.startRecording — wird whisper-bridge-seitig per wave-Modul gestrippt. Diagnostic-Seite: - Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70 (persistiert in voice_config.json, broadcast als config-Message), Refresh + Delete-Button. - server.js: 2 neue actions (voice_id_status, voice_id_delete), send_voice_config nimmt voiceIdThreshold mit auf. Backend: - speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header (RIFF/WAVE) und strippt auf rohes PCM — sonst fliesst der 44-Byte-Header als Audio in die ECAPA-Embeddings ein. - bridge.py: config-Broadcast-Handler schreibt voiceIdThreshold nach speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating genutzt, persistiert aber schon). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,426 @@
|
|||||||
|
/**
|
||||||
|
* Voice-ID Enrollment + Status — App-seitig.
|
||||||
|
*
|
||||||
|
* User nimmt 5-7 Samples (je 4s) seiner Stimme auf, App schickt sie an
|
||||||
|
* die whisper-bridge via RVS (voice_id_enroll_request). Bridge berechnet
|
||||||
|
* SpeechBrain-ECAPA-Embeddings, mittelt sie zu einem Fingerprint, speichert
|
||||||
|
* /voice-id/fingerprint.json.
|
||||||
|
*
|
||||||
|
* Verwendung: in SettingsScreen für Section 'voice_id' eingebunden.
|
||||||
|
* Holt Status bei Mount + nach jedem Enroll/Delete neu ab.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import React, { useCallback, useEffect, useState } from 'react';
|
||||||
|
import {
|
||||||
|
ActivityIndicator,
|
||||||
|
Alert,
|
||||||
|
ScrollView,
|
||||||
|
StyleSheet,
|
||||||
|
Text,
|
||||||
|
ToastAndroid,
|
||||||
|
TouchableOpacity,
|
||||||
|
View,
|
||||||
|
} from 'react-native';
|
||||||
|
|
||||||
|
import audioService from '../services/audio';
|
||||||
|
import rvs from '../services/rvs';
|
||||||
|
|
||||||
|
const SAMPLE_DURATION_MS = 4000; // Pro Sample 4s aufnehmen
|
||||||
|
const SAMPLES_REQUIRED = 5; // Mindest-Sampleanzahl fuer Save
|
||||||
|
|
||||||
|
/** One locally recorded voice sample that has not been sent for enrollment yet. */
interface Sample {
  /** Base64-encoded recording as returned by `audioService.stopRecording()`. */
  base64: string;
  /** Length of the recording in milliseconds, as reported by the audio service. */
  durationMs: number;
}
|
||||||
|
|
||||||
|
/** A fingerprint exists; fields mirror the `voice_id_status_response` payload. */
type EnrolledStatus = {
  state: 'enrolled';
  /** Number of samples the stored fingerprint was built from. */
  sampleCount: number;
  /** Per-sample durations in seconds (payload field `sample_durations_s`). */
  durations: number[];
  /** Last update time in epoch seconds (multiplied by 1000 before display). */
  updatedAt: number;
  /** Embedding dimension reported by the bridge (payload field `embedding_dim`). */
  dim: number;
};

/** The status request failed; `message` is shown to the user as-is. */
type ErrorStatus = { state: 'error'; message: string };

/** Enrollment status shown in the status card, discriminated by `state`. */
type Status =
  | { state: 'loading' }
  | { state: 'unenrolled' }
  | EnrolledStatus
  | ErrorStatus;
|
||||||
|
|
||||||
|
/**
 * Builds a request id of the form `<prefix>_<time>_<random>`.
 *
 * Both the current timestamp and a random integer below 1e6 are rendered in
 * base 36, so the id stays short while being unlikely to repeat.
 */
function _newReqId(prefix: string): string {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
  return [prefix, timePart, randomPart].join('_');
}
|
||||||
|
|
||||||
|
/**
 * Voice-ID enrollment and status UI.
 *
 * Shows the current enrollment status, lets the user record fixed-length voice
 * samples, sends them to the bridge via RVS (`voice_id_enroll_request`) and
 * allows deleting the stored fingerprint. The status is requested on mount and
 * again after every enroll/delete response.
 */
export const VoiceIdEnrollment: React.FC = () => {
  const [status, setStatus] = useState<Status>({ state: 'loading' });
  const [samples, setSamples] = useState<Sample[]>([]);
  const [recording, setRecording] = useState(false);
  const [recordCountdown, setRecordCountdown] = useState(0);
  const [enrollPending, setEnrollPending] = useState(false);

  // Synchronous re-entrancy guard: the `recording` state only updates on the
  // next render, so two quick taps could otherwise start two recordings.
  const recordingRef = React.useRef(false);
  // Handle of the running enrollment safety timeout (null when none is armed).
  const enrollTimeoutRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);

  /** Cancels the enrollment safety timeout if one is armed. */
  const clearEnrollTimeout = useCallback(() => {
    if (enrollTimeoutRef.current !== null) {
      clearTimeout(enrollTimeoutRef.current);
      enrollTimeoutRef.current = null;
    }
  }, []);

  // Never let the safety timeout fire after the component is gone: by then the
  // RVS listener is unsubscribed and the toast would be a false alarm.
  useEffect(() => {
    return () => clearEnrollTimeout();
  }, [clearEnrollTimeout]);

  /** Asks the bridge for the current enrollment status. */
  const refreshStatus = useCallback(() => {
    setStatus({ state: 'loading' });
    rvs.send('voice_id_status_request' as any, { requestId: _newReqId('vid') });
  }, []);

  useEffect(() => {
    refreshStatus();
  }, [refreshStatus]);

  // Handle RVS responses for status / enroll / delete.
  useEffect(() => {
    const unsub = rvs.onMessage((msg: any) => {
      if (!msg) return;
      const p = msg.payload || {};
      if (msg.type === 'voice_id_status_response') {
        if (p.ok === false) {
          setStatus({ state: 'error', message: p.error || 'Whisper-Bridge nicht erreichbar' });
          return;
        }
        if (p.enrolled) {
          setStatus({
            state: 'enrolled',
            sampleCount: p.sample_count || 0,
            durations: p.sample_durations_s || [],
            updatedAt: p.updated_at || 0,
            dim: p.embedding_dim || 0,
          });
        } else {
          setStatus({ state: 'unenrolled' });
        }
      } else if (msg.type === 'voice_id_enroll_response') {
        // A response arrived, so the safety timeout must not fire any more.
        clearEnrollTimeout();
        setEnrollPending(false);
        if (p.ok === false) {
          Alert.alert('Enrollment fehlgeschlagen', p.error || 'Unbekannter Fehler');
          return;
        }
        const rejected = (p.rejected || []).length;
        ToastAndroid.show(
          `✓ Stimme gespeichert (${p.sample_count} Samples${rejected ? `, ${rejected} verworfen` : ''})`,
          ToastAndroid.LONG,
        );
        setSamples([]);
        refreshStatus();
      } else if (msg.type === 'voice_id_delete_response') {
        ToastAndroid.show(p.removed ? '✓ Stimme gelöscht' : 'Es war keine gespeichert', ToastAndroid.SHORT);
        refreshStatus();
      }
    });
    return () => unsub();
  }, [refreshStatus, clearEnrollTimeout]);

  /** Records one sample of fixed length (SAMPLE_DURATION_MS), then stops automatically. */
  const recordSample = useCallback(async () => {
    if (recordingRef.current || enrollPending) return;
    recordingRef.current = true;
    setRecording(true);
    setRecordCountdown(SAMPLE_DURATION_MS / 1000);
    let tickInterval: ReturnType<typeof setInterval> | null = null;
    try {
      const ok = await audioService.startRecording(false);
      if (!ok) {
        ToastAndroid.show('Aufnahme konnte nicht gestartet werden', ToastAndroid.LONG);
        return;
      }
      // Countdown is purely cosmetic; the actual stop is driven by the timeout below.
      tickInterval = setInterval(() => {
        setRecordCountdown(c => Math.max(0, c - 1));
      }, 1000);
      await new Promise<void>(resolve => setTimeout(resolve, SAMPLE_DURATION_MS));
      const result = await audioService.stopRecording();
      if (!result || !result.base64) {
        ToastAndroid.show('Aufnahme leer — nochmal probieren', ToastAndroid.LONG);
        return;
      }
      setSamples(prev => [...prev, { base64: result.base64, durationMs: result.durationMs }]);
    } catch (err: unknown) {
      console.warn('[VoiceId] recordSample:', err);
      // Best effort: release the recorder; a failure here has nothing to add.
      try { await audioService.cancelRecording(); } catch {}
      const detail = err instanceof Error && err.message ? err.message : String(err);
      ToastAndroid.show('Aufnahmefehler: ' + detail, ToastAndroid.LONG);
    } finally {
      // Single exit point: always stop the countdown and unlock the UI.
      if (tickInterval !== null) clearInterval(tickInterval);
      recordingRef.current = false;
      setRecording(false);
      setRecordCountdown(0);
    }
  }, [enrollPending]);

  /** Drops the sample at the given list index. */
  const removeSample = useCallback((idx: number) => {
    setSamples(prev => prev.filter((_, i) => i !== idx));
  }, []);

  /** Sends all recorded samples to the bridge and arms a 60 s safety timeout. */
  const sendEnrollment = useCallback(() => {
    if (samples.length < SAMPLES_REQUIRED) {
      Alert.alert('Noch nicht genug',
        `Bitte mindestens ${SAMPLES_REQUIRED} Samples aufnehmen — aktuell ${samples.length}.`);
      return;
    }
    if (enrollPending) return;
    setEnrollPending(true);
    rvs.send('voice_id_enroll_request' as any, {
      requestId: _newReqId('videnroll'),
      samples: samples.map(sample => sample.base64),
    });
    // Safety timeout: if nothing comes back within 60 s, unlock the UI again.
    // It is cancelled as soon as an enroll response arrives or on unmount, so
    // a stale timer can never reset a later enrollment attempt.
    clearEnrollTimeout();
    enrollTimeoutRef.current = setTimeout(() => {
      enrollTimeoutRef.current = null;
      setEnrollPending(false);
      ToastAndroid.show('Enrollment-Timeout — bitte erneut versuchen', ToastAndroid.LONG);
    }, 60_000);
  }, [samples, enrollPending, clearEnrollTimeout]);

  /** Asks for confirmation, then requests deletion of the stored fingerprint. */
  const deleteFingerprint = useCallback(() => {
    Alert.alert(
      'Stimme löschen?',
      'Danach muss ARIA neu enrolled werden, sonst greift Speaker-ID-Filter nicht.',
      [
        { text: 'Abbrechen', style: 'cancel' },
        {
          text: 'Löschen', style: 'destructive', onPress: () => {
            rvs.send('voice_id_delete_request' as any, { requestId: _newReqId('viddel') });
          },
        },
      ],
    );
  }, []);

  // ── Render ──────────────────────────────────────────────

  const recordDisabled = recording || enrollPending;
  const saveDisabled = samples.length < SAMPLES_REQUIRED || enrollPending;

  return (
    <ScrollView contentContainerStyle={{ paddingBottom: 30 }}>
      <Text style={s.intro}>
        ARIA erkennt deine Stimme an einem Fingerprint (SpeechBrain ECAPA-TDNN, 192 Dimensionen).
        Andere Sprecher (TV, Hintergrund, andere Personen) werden gefiltert — keine Brain-Calls,
        keine Tokens. {'\n\n'}
        Sprich {SAMPLES_REQUIRED} Mal je {SAMPLE_DURATION_MS / 1000}s ganz normal — verschiedene
        Sätze, ruhige Umgebung empfohlen.
      </Text>

      {/* Status card */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Status</Text>
        {status.state === 'loading' && (
          <View style={{ flexDirection: 'row', alignItems: 'center', gap: 8 }}>
            <ActivityIndicator color="#0096FF" />
            <Text style={s.statusText}>Wird abgefragt...</Text>
          </View>
        )}
        {status.state === 'unenrolled' && (
          <Text style={[s.statusText, { color: '#FFD60A' }]}>○ Nicht enrolled — Stimme einrichten ↓</Text>
        )}
        {status.state === 'enrolled' && (
          <>
            <Text style={[s.statusText, { color: '#34C759' }]}>
              ✓ Enrolled — {status.sampleCount} Samples
              ({status.durations.reduce((a, b) => a + b, 0).toFixed(1)}s gesamt)
            </Text>
            <Text style={s.statusSub}>
              {/* A missing timestamp arrives as 0; show "?" instead of a 1970 date. */}
              Aktualisiert {status.updatedAt ? new Date(status.updatedAt * 1000).toLocaleString('de-DE') : '?'} · dim={status.dim}
            </Text>
          </>
        )}
        {status.state === 'error' && (
          <Text style={[s.statusText, { color: '#FF6E6E' }]}>⚠ {status.message}</Text>
        )}
      </View>

      {/* Recording area */}
      <View style={s.card}>
        <Text style={s.cardLabel}>Samples ({samples.length}/{SAMPLES_REQUIRED})</Text>
        {samples.length === 0 && !recording && (
          <Text style={s.hint}>Tipp: sprich klare normale Sätze, je 3-4 Sekunden Audio.</Text>
        )}
        {samples.map((sample, idx) => (
          <View key={idx} style={s.sampleRow}>
            <Text style={s.sampleText}>
              Sample {idx + 1} · {(sample.durationMs / 1000).toFixed(1)}s
            </Text>
            <TouchableOpacity onPress={() => removeSample(idx)} disabled={enrollPending}>
              <Text style={{ color: '#FF6E6E', fontSize: 18 }}>✕</Text>
            </TouchableOpacity>
          </View>
        ))}

        <TouchableOpacity
          onPress={recordSample}
          disabled={recordDisabled}
          style={[s.recordBtn, recordDisabled && { opacity: 0.5 }]}
        >
          {recording ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.recordBtnText}>Aufnahme läuft… {recordCountdown}s</Text>
            </>
          ) : (
            <Text style={s.recordBtnText}>⏺ Sample {samples.length + 1} aufnehmen</Text>
          )}
        </TouchableOpacity>

        {samples.length > 0 && !recording && (
          <TouchableOpacity
            onPress={() => setSamples([])}
            disabled={enrollPending}
            style={s.resetBtn}
          >
            <Text style={s.resetBtnText}>Alle verwerfen</Text>
          </TouchableOpacity>
        )}
      </View>

      {/* Actions */}
      <View style={{ flexDirection: 'row', gap: 8, marginTop: 8 }}>
        <TouchableOpacity
          onPress={sendEnrollment}
          disabled={saveDisabled}
          style={[
            s.primaryBtn,
            saveDisabled && { opacity: 0.4 },
          ]}
        >
          {enrollPending ? (
            <>
              <ActivityIndicator color="#fff" />
              <Text style={s.primaryBtnText}>Wird verarbeitet…</Text>
            </>
          ) : (
            <Text style={s.primaryBtnText}>
              ✓ Speichern ({samples.length}/{SAMPLES_REQUIRED})
            </Text>
          )}
        </TouchableOpacity>
      </View>

      {/* Management (only once a fingerprint exists) */}
      {status.state === 'enrolled' && (
        <View style={[s.card, { marginTop: 20 }]}>
          <Text style={s.cardLabel}>Verwaltung</Text>
          <TouchableOpacity onPress={refreshStatus} style={s.secondaryBtn}>
            <Text style={s.secondaryBtnText}>🔄 Status aktualisieren</Text>
          </TouchableOpacity>
          <TouchableOpacity onPress={deleteFingerprint} style={s.dangerBtn}>
            <Text style={s.dangerBtnText}>🗑 Fingerprint löschen (Re-Enrollment nötig)</Text>
          </TouchableOpacity>
        </View>
      )}
    </ScrollView>
  );
};
|
||||||
|
|
||||||
|
/** Styles for the Voice-ID enrollment component. */
const s = StyleSheet.create({
  // Intro paragraph above the cards
  intro: { color: '#8888AA', fontSize: 13, lineHeight: 19, marginBottom: 16, paddingHorizontal: 4 },

  // Card container and its caption
  card: { backgroundColor: 'rgba(30,30,46,0.6)', borderRadius: 8, padding: 14, marginBottom: 10 },
  cardLabel: {
    color: '#8888AA',
    fontSize: 11,
    fontWeight: '700',
    textTransform: 'uppercase',
    letterSpacing: 0.5,
    marginBottom: 8,
  },

  // Status card texts
  statusText: { color: '#E0E0F0', fontSize: 14, fontWeight: '600' },
  statusSub: { color: '#555570', fontSize: 11, marginTop: 4 },

  // Sample list
  hint: { color: '#555570', fontSize: 12, fontStyle: 'italic', marginBottom: 8 },
  sampleRow: {
    flexDirection: 'row',
    justifyContent: 'space-between',
    alignItems: 'center',
    paddingVertical: 6,
    borderBottomWidth: 1,
    borderColor: '#2A2A3E',
  },
  sampleText: { color: '#E0E0F0', fontSize: 13 },

  // Record / reset buttons
  recordBtn: {
    flexDirection: 'row',
    alignItems: 'center',
    justifyContent: 'center',
    gap: 8,
    backgroundColor: '#E55C5C',
    borderRadius: 8,
    paddingVertical: 14,
    marginTop: 12,
  },
  recordBtnText: { color: '#fff', fontSize: 15, fontWeight: '700' },
  resetBtn: { alignItems: 'center', paddingVertical: 8, marginTop: 6 },
  resetBtnText: { color: '#FFD60A', fontSize: 12 },

  // Save button
  primaryBtn: {
    flex: 1,
    flexDirection: 'row',
    alignItems: 'center',
    justifyContent: 'center',
    gap: 8,
    backgroundColor: '#34C759',
    borderRadius: 8,
    paddingVertical: 14,
  },
  primaryBtnText: { color: '#fff', fontSize: 15, fontWeight: '700' },

  // Management buttons
  secondaryBtn: {
    backgroundColor: 'rgba(0,150,255,0.15)',
    borderRadius: 6,
    paddingVertical: 10,
    alignItems: 'center',
    marginTop: 6,
  },
  secondaryBtnText: { color: '#0096FF', fontSize: 13, fontWeight: '600' },
  dangerBtn: {
    backgroundColor: 'rgba(229,92,92,0.15)',
    borderRadius: 6,
    paddingVertical: 10,
    alignItems: 'center',
    marginTop: 6,
  },
  dangerBtnText: { color: '#E55C5C', fontSize: 13, fontWeight: '600' },
});
|
||||||
|
|
||||||
|
export default VoiceIdEnrollment;
|
||||||
@@ -91,6 +91,7 @@ import MemoryBrowser from '../components/MemoryBrowser';
|
|||||||
import TriggerBrowser from '../components/TriggerBrowser';
|
import TriggerBrowser from '../components/TriggerBrowser';
|
||||||
import SkillBrowser from '../components/SkillBrowser';
|
import SkillBrowser from '../components/SkillBrowser';
|
||||||
import OAuthBrowser from '../components/OAuthBrowser';
|
import OAuthBrowser from '../components/OAuthBrowser';
|
||||||
|
import VoiceIdEnrollment from '../components/VoiceIdEnrollment';
|
||||||
import { isVerboseLogging, setVerboseLogging, isDebugLogsToBridge, setDebugLogsToBridge, APP_LOG_EVENT } from '../services/logger';
|
import { isVerboseLogging, setVerboseLogging, isDebugLogsToBridge, setDebugLogsToBridge, APP_LOG_EVENT } from '../services/logger';
|
||||||
import {
|
import {
|
||||||
isWakeReadySoundEnabled,
|
isWakeReadySoundEnabled,
|
||||||
@@ -136,6 +137,7 @@ const SETTINGS_SECTIONS = [
|
|||||||
{ id: 'general', icon: '⚙️', label: 'Allgemein', desc: 'Betriebsmodus, GPS-Standort' },
|
{ id: 'general', icon: '⚙️', label: 'Allgemein', desc: 'Betriebsmodus, GPS-Standort' },
|
||||||
{ id: 'voice_input', icon: '🎙️', label: 'Spracheingabe', desc: 'Stille-Toleranz, Aufnahmedauer' },
|
{ id: 'voice_input', icon: '🎙️', label: 'Spracheingabe', desc: 'Stille-Toleranz, Aufnahmedauer' },
|
||||||
{ id: 'wake_word', icon: '👂', label: 'Wake-Word', desc: 'Wake-Word-Auswahl' },
|
{ id: 'wake_word', icon: '👂', label: 'Wake-Word', desc: 'Wake-Word-Auswahl' },
|
||||||
|
{ id: 'voice_id', icon: '🎤', label: 'Stimme einrichten', desc: 'Sprecher-Erkennung — nur deine Stimme triggert ARIA' },
|
||||||
{ id: 'voice_output', icon: '🔊', label: 'Sprachausgabe', desc: 'Stimmen, Pre-Roll, Geschwindigkeit' },
|
{ id: 'voice_output', icon: '🔊', label: 'Sprachausgabe', desc: 'Stimmen, Pre-Roll, Geschwindigkeit' },
|
||||||
{ id: 'storage', icon: '📁', label: 'Speicher', desc: 'Anhang-Speicherort, Auto-Download' },
|
{ id: 'storage', icon: '📁', label: 'Speicher', desc: 'Anhang-Speicherort, Auto-Download' },
|
||||||
{ id: 'files', icon: '📂', label: 'Dateien', desc: 'ARIA- und User-Dateien — anzeigen, löschen' },
|
{ id: 'files', icon: '📂', label: 'Dateien', desc: 'ARIA- und User-Dateien — anzeigen, löschen' },
|
||||||
@@ -1836,6 +1838,12 @@ const SettingsScreen: React.FC = () => {
|
|||||||
</View>
|
</View>
|
||||||
</>)}
|
</>)}
|
||||||
|
|
||||||
|
{/* === Voice-ID Enrollment (Sprecher-Erkennung) === */}
|
||||||
|
{currentSection === 'voice_id' && (<>
|
||||||
|
<Text style={styles.sectionTitle}>Stimme einrichten</Text>
|
||||||
|
<VoiceIdEnrollment />
|
||||||
|
</>)}
|
||||||
|
|
||||||
{/* === Sprachausgabe (geraetelokal) === */}
|
{/* === Sprachausgabe (geraetelokal) === */}
|
||||||
{currentSection === 'voice_output' && (<>
|
{currentSection === 'voice_output' && (<>
|
||||||
<Text style={styles.sectionTitle}>Sprachausgabe</Text>
|
<Text style={styles.sectionTitle}>Sprachausgabe</Text>
|
||||||
|
|||||||
@@ -764,6 +764,42 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Voice-ID (Sprecher-Erkennung): status line, match-threshold slider, refresh/delete buttons.
     The slider value is sent via sendVoiceConfig() on change; the status line is filled by the
     voice_id_status_response handler. -->
<div class="settings-section">
  <h2>Voice-ID (Sprecher-Erkennung)</h2>
  <div style="font-size:11px;color:#8888AA;margin-bottom:8px;">
    ARIA erkennt Stefans Stimme anhand eines Fingerprints (SpeechBrain ECAPA-TDNN).
    Andere Sprecher (TV, Hintergrund-Gespraeche) werden gefiltert — keine Brain-
    Calls, keine Tokens. Enrollment passiert in der App (Settings → Stimme einrichten),
    weil das Handy-Mikro auch im Betrieb hoert.
  </div>
  <div class="card" style="max-width:500px;">
    <div id="voice-id-status" style="font-size:13px;color:#E0E0F0;margin-bottom:10px;">
      Status wird geladen...
    </div>
    <div style="display:flex;align-items:center;gap:12px;margin-bottom:8px;">
      <!-- "for" ties the label to the slider for screen readers and click-to-focus. -->
      <label for="diag-voice-id-threshold" style="color:#8888AA;font-size:12px;min-width:130px;">Match-Threshold:</label>
      <!-- The display is always formatted with two decimals so it matches the initial
           "0.50" and the value restored from the saved config (range inputs report "0.5"). -->
      <input type="range" id="diag-voice-id-threshold" min="0.30" max="0.70" step="0.05" value="0.50"
             oninput="document.getElementById('voice-id-threshold-display').textContent = Number(this.value).toFixed(2)"
             onchange="sendVoiceConfig()"
             style="flex:1;">
      <span id="voice-id-threshold-display" style="color:#E0E0F0;font-family:monospace;min-width:40px;text-align:right;">0.50</span>
    </div>
    <div style="font-size:10px;color:#555570;margin-bottom:12px;">
      Niedriger = mehr Treffer auch bei Nebengeraeuschen (false-positives).
      Hoeher = strenger, kann Stefan auch mal verpassen. 0.50 ist konservativer Default.
    </div>
    <div style="display:flex;gap:8px;">
      <button class="btn secondary" onclick="refreshVoiceIdStatus()" style="padding:6px 14px;font-size:12px;">
        🔄 Status aktualisieren
      </button>
      <button class="btn danger" onclick="deleteVoiceId()" style="padding:6px 14px;font-size:12px;">
        🗑 Fingerprint löschen
      </button>
    </div>
  </div>
</div>
|
||||||
|
|
||||||
<!-- Runtime-Konfiguration -->
|
<!-- Runtime-Konfiguration -->
|
||||||
<div class="settings-section">
|
<div class="settings-section">
|
||||||
<h2>Runtime-Konfiguration</h2>
|
<h2>Runtime-Konfiguration</h2>
|
||||||
@@ -1475,6 +1511,46 @@
|
|||||||
setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw);
|
setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw);
|
||||||
setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch);
|
setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch);
|
||||||
setIfPresent('diag-flux-hf-token', msg.huggingfaceToken);
|
setIfPresent('diag-flux-hf-token', msg.huggingfaceToken);
|
||||||
|
// Voice-ID-Threshold wiederherstellen (Default 0.50)
|
||||||
|
if (msg.voiceIdThreshold !== undefined && msg.voiceIdThreshold !== null) {
|
||||||
|
const slider = document.getElementById('diag-voice-id-threshold');
|
||||||
|
const display = document.getElementById('voice-id-threshold-display');
|
||||||
|
if (slider) slider.value = msg.voiceIdThreshold;
|
||||||
|
if (display) display.textContent = Number(msg.voiceIdThreshold).toFixed(2);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'voice_id_status_response') {
|
||||||
|
const el = document.getElementById('voice-id-status');
|
||||||
|
if (!el) return;
|
||||||
|
if (msg.payload && msg.payload.ok === false) {
|
||||||
|
el.innerHTML = '<span style="color:#FF6E6E;">⚠ Whisper-Bridge nicht erreichbar: ' +
|
||||||
|
(msg.payload.error || 'unbekannt') + '</span>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const p = msg.payload || msg;
|
||||||
|
if (p.enrolled) {
|
||||||
|
const when = p.updated_at ? new Date(p.updated_at * 1000).toLocaleString('de-DE') : '?';
|
||||||
|
const totalSec = (p.sample_durations_s || []).reduce((a, b) => a + b, 0);
|
||||||
|
el.innerHTML = '<span style="color:#34C759;">✓ Enrolled</span> · ' +
|
||||||
|
p.sample_count + ' Samples (' + totalSec.toFixed(1) + 's) · ' +
|
||||||
|
'aktualisiert ' + when + ' · dim=' + (p.embedding_dim || '?');
|
||||||
|
} else {
|
||||||
|
el.innerHTML = '<span style="color:#FFD60A;">○ Nicht enrolled</span> — ' +
|
||||||
|
'in der App unter "Stimme einrichten" 5-10× je 3s aufnehmen.';
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'voice_id_delete_response') {
|
||||||
|
const p = msg.payload || msg;
|
||||||
|
if (p.removed) {
|
||||||
|
alert('Fingerprint gelöscht — Voice-ID-Gating fällt zurück auf Fail-Open.');
|
||||||
|
} else {
|
||||||
|
alert('Es war kein Fingerprint vorhanden.');
|
||||||
|
}
|
||||||
|
refreshVoiceIdStatus();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2607,6 +2683,17 @@
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Shows a "querying" placeholder in the Voice-ID status line (if present)
 * and requests the current status via the `voice_id_status` action.
 */
function refreshVoiceIdStatus() {
  const statusLine = document.getElementById('voice-id-status');
  if (statusLine) {
    statusLine.textContent = '⏳ Status wird abgefragt...';
  }
  send({ action: 'voice_id_status' });
}
|
||||||
|
|
||||||
|
/**
 * Asks the user for confirmation and, if confirmed, sends the
 * `voice_id_delete` action to remove the stored fingerprint.
 */
function deleteVoiceId() {
  const confirmed = confirm('Voice-ID-Fingerprint loeschen?\n\nDanach muss in der App neu enrolled werden.');
  if (confirmed) {
    send({ action: 'voice_id_delete' });
  }
}
|
||||||
|
|
||||||
function deleteXttsVoice(name) {
|
function deleteXttsVoice(name) {
|
||||||
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
|
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
|
||||||
send({ action: 'xtts_delete_voice', name });
|
send({ action: 'xtts_delete_voice', name });
|
||||||
@@ -2823,12 +2910,15 @@
|
|||||||
const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value;
|
const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value;
|
||||||
const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value;
|
const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value;
|
||||||
const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value;
|
const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value;
|
||||||
|
const voiceIdThresholdRaw = document.getElementById('diag-voice-id-threshold')?.value;
|
||||||
|
const voiceIdThreshold = voiceIdThresholdRaw ? parseFloat(voiceIdThresholdRaw) : undefined;
|
||||||
send({
|
send({
|
||||||
action: 'send_voice_config',
|
action: 'send_voice_config',
|
||||||
ttsEnabled, xttsVoice, whisperModel,
|
ttsEnabled, xttsVoice, whisperModel,
|
||||||
f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
|
f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
|
||||||
f5ttsCfgStrength, f5ttsNfeStep,
|
f5ttsCfgStrength, f5ttsNfeStep,
|
||||||
fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken,
|
fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken,
|
||||||
|
voiceIdThreshold,
|
||||||
});
|
});
|
||||||
const statusEl = document.getElementById('voice-status');
|
const statusEl = document.getElementById('voice-status');
|
||||||
if (statusEl && xttsVoice) {
|
if (statusEl && xttsVoice) {
|
||||||
@@ -3354,6 +3444,7 @@
|
|||||||
loadRuntimeConfig();
|
loadRuntimeConfig();
|
||||||
loadOnboardingQR();
|
loadOnboardingQR();
|
||||||
loadOAuthServices();
|
loadOAuthServices();
|
||||||
|
refreshVoiceIdStatus();
|
||||||
} else if (tab === 'brain') {
|
} else if (tab === 'brain') {
|
||||||
loadBrainStatus();
|
loadBrainStatus();
|
||||||
loadBrainMemoryList();
|
loadBrainMemoryList();
|
||||||
|
|||||||
@@ -2367,6 +2367,12 @@ wss.on("connection", (ws) => {
|
|||||||
if (msg.huggingfaceToken !== undefined) {
|
if (msg.huggingfaceToken !== undefined) {
|
||||||
voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim();
|
voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim();
|
||||||
}
|
}
|
||||||
|
// Voice-ID Match-Threshold (0.30-0.70). Wird von der whisper-bridge
|
||||||
|
// ueber den config-Broadcast aufgenommen — Phase 3 nutzt's beim Gating.
|
||||||
|
if (msg.voiceIdThreshold !== undefined && !isNaN(msg.voiceIdThreshold)) {
|
||||||
|
const t = parseFloat(msg.voiceIdThreshold);
|
||||||
|
if (t >= 0.0 && t <= 1.0) voiceConfig.voiceIdThreshold = t;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
fs.mkdirSync("/shared/config", { recursive: true });
|
fs.mkdirSync("/shared/config", { recursive: true });
|
||||||
fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
|
fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
|
||||||
@@ -2390,6 +2396,15 @@ wss.on("connection", (ws) => {
|
|||||||
handleGetModel(ws);
|
handleGetModel(ws);
|
||||||
} else if (msg.action === "set_model") {
|
} else if (msg.action === "set_model") {
|
||||||
handleSetModel(ws, msg.model);
|
handleSetModel(ws, msg.model);
|
||||||
|
} else if (msg.action === "voice_id_status") {
|
||||||
|
// An whisper-bridge weiterleiten + Antwort an Browser zurueck
|
||||||
|
const reqId = `vid_${Date.now().toString(36)}`;
|
||||||
|
sendToRVS_withResponse("voice_id_status_request", { requestId: reqId },
|
||||||
|
"voice_id_status_response", ws);
|
||||||
|
} else if (msg.action === "voice_id_delete") {
|
||||||
|
const reqId = `viddel_${Date.now().toString(36)}`;
|
||||||
|
sendToRVS_withResponse("voice_id_delete_request", { requestId: reqId },
|
||||||
|
"voice_id_delete_response", ws);
|
||||||
}
|
}
|
||||||
// get_openclaw_config entfernt — aria-core ist raus.
|
// get_openclaw_config entfernt — aria-core ist raus.
|
||||||
} catch {}
|
} catch {}
|
||||||
|
|||||||
@@ -781,6 +781,17 @@ async def run_loop(runner: WhisperRunner, sessions: SessionManager) -> None:
|
|||||||
# Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog
|
# Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog
|
||||||
# damit Stefan im laufenden Betrieb via Diagnostic-Settings
|
# damit Stefan im laufenden Betrieb via Diagnostic-Settings
|
||||||
# die Logs an/aus schalten kann.
|
# die Logs an/aus schalten kann.
|
||||||
|
# Voice-ID Match-Threshold (von Diagnostic gesendet) auf das
|
||||||
|
# speaker_id-Modul setzen — wird erst in Phase 3 beim Gating
|
||||||
|
# genutzt, aber persistiert bereits jetzt.
|
||||||
|
if "voiceIdThreshold" in payload:
|
||||||
|
try:
|
||||||
|
t = float(payload.get("voiceIdThreshold", 0.5))
|
||||||
|
if 0.0 <= t <= 1.0:
|
||||||
|
speaker_id.DEFAULT_THRESHOLD = t
|
||||||
|
logger.info("[speaker-id] threshold gesetzt: %.2f", t)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
pass
|
||||||
if "whisperDebugLog" in payload:
|
if "whisperDebugLog" in payload:
|
||||||
global _DEBUG_LOG_TO_BRIDGE
|
global _DEBUG_LOG_TO_BRIDGE
|
||||||
old = _DEBUG_LOG_TO_BRIDGE
|
old = _DEBUG_LOG_TO_BRIDGE
|
||||||
|
|||||||
@@ -61,10 +61,35 @@ def _ensure_loaded():
|
|||||||
return _model
|
return _model
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_audio_bytes(audio_bytes: bytes) -> bytes:
|
||||||
|
"""Akzeptiert entweder rohes 16kHz int16 LE PCM ODER eine WAV-Datei (RIFF/WAVE).
|
||||||
|
Bei WAV wird der Header gestrippt + Format validiert (16kHz / mono / int16).
|
||||||
|
Ergebnis: rohes PCM."""
|
||||||
|
if (len(audio_bytes) >= 44
|
||||||
|
and audio_bytes[:4] == b"RIFF"
|
||||||
|
and audio_bytes[8:12] == b"WAVE"):
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
with wave.open(io.BytesIO(audio_bytes), "rb") as wav:
|
||||||
|
sr = wav.getframerate()
|
||||||
|
ch = wav.getnchannels()
|
||||||
|
sw = wav.getsampwidth()
|
||||||
|
if sr != 16000:
|
||||||
|
raise ValueError(f"WAV-Samplerate {sr} != 16000")
|
||||||
|
if ch != 1:
|
||||||
|
raise ValueError(f"WAV-Kanalzahl {ch} != 1 (mono erwartet)")
|
||||||
|
if sw != 2:
|
||||||
|
raise ValueError(f"WAV-Sampleweite {sw} != 2 (int16 erwartet)")
|
||||||
|
return wav.readframes(wav.getnframes())
|
||||||
|
return audio_bytes
|
||||||
|
|
||||||
|
|
||||||
def _audio_bytes_to_tensor(audio_bytes: bytes):
    """Convert int16 LE PCM (16 kHz mono) into a float32 Torch tensor of shape (1, N).

    WAV input is first reduced to raw PCM via ``_normalize_audio_bytes``.
    Samples are divided by 32768 so values fall into [-1, 1).
    """
    import torch

    pcm = _normalize_audio_bytes(audio_bytes)
    samples = np.frombuffer(pcm, dtype=np.int16)
    scaled = samples.astype(np.float32) / 32768.0
    # unsqueeze(0) adds the leading dimension of size 1.
    return torch.from_numpy(scaled).unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user