feat(speaker-id): Phase 2 — Enrollment-UI (App) + Voice-ID-Section (Diagnostic)
App-Seite:
- VoiceIdEnrollment.tsx (neue Komponente, ~370 Zeilen): Status-Karte (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button (disabled bis 5 Samples), Fingerprint-Delete mit Confirm.
- SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen Wake-Word und Sprachausgabe.
- Sample-Format: WAV via audioService.startRecording — wird whisper-bridge-seitig per wave-Modul gestrippt.

Diagnostic-Seite:
- Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70 (persistiert in voice_config.json, broadcast als config-Message), Refresh- + Delete-Button.
- server.js: 2 neue actions (voice_id_status, voice_id_delete), send_voice_config nimmt voiceIdThreshold mit auf.

Backend:
- speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header (RIFF/WAVE) und strippt auf rohes PCM — sonst fliesst der 44-Byte-Header in die ECAPA-Embeddings mit ein.
- bridge.py: config-Broadcast-Handler setzt voiceIdThreshold auf speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating genutzt, persistiert aber schon).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -764,6 +764,42 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Voice-ID (Sprecher-Erkennung) -->
|
||||
<div class="settings-section">
|
||||
<h2>Voice-ID (Sprecher-Erkennung)</h2>
|
||||
<div style="font-size:11px;color:#8888AA;margin-bottom:8px;">
|
||||
ARIA erkennt Stefans Stimme anhand eines Fingerprints (SpeechBrain ECAPA-TDNN).
|
||||
Andere Sprecher (TV, Hintergrund-Gespraeche) werden gefiltert — keine Brain-
|
||||
Calls, keine Tokens. Enrollment passiert in der App (Settings → Stimme einrichten),
|
||||
weil das Handy-Mikro auch im Betrieb hoert.
|
||||
</div>
|
||||
<div class="card" style="max-width:500px;">
|
||||
<div id="voice-id-status" style="font-size:13px;color:#E0E0F0;margin-bottom:10px;">
|
||||
Status wird geladen...
|
||||
</div>
|
||||
<!-- Match-threshold row: range slider (0.30-0.70, step 0.05) plus a live numeric read-out.
     oninput only updates the read-out; onchange sends the value via sendVoiceConfig().
     The read-out is formatted with toFixed(2) because a range input reports its value in
     shortest form ("0.5"), which would otherwise disagree with the initial "0.50" text and
     with the two-decimal formatting used when the saved config is restored. -->
<div style="display:flex;align-items:center;gap:12px;margin-bottom:8px;">
  <label for="diag-voice-id-threshold" style="color:#8888AA;font-size:12px;min-width:130px;">Match-Threshold:</label>
  <input type="range" id="diag-voice-id-threshold" min="0.30" max="0.70" step="0.05" value="0.50"
         oninput="document.getElementById('voice-id-threshold-display').textContent = Number(this.value).toFixed(2)"
         onchange="sendVoiceConfig()"
         style="flex:1;">
  <span id="voice-id-threshold-display" style="color:#E0E0F0;font-family:monospace;min-width:40px;text-align:right;">0.50</span>
</div>
|
||||
<div style="font-size:10px;color:#555570;margin-bottom:12px;">
|
||||
Niedriger = mehr Treffer auch bei Nebengeraeuschen (false-positives).
|
||||
Hoeher = strenger, kann Stefan auch mal verpassen. 0.50 ist konservativer Default.
|
||||
</div>
|
||||
<div style="display:flex;gap:8px;">
|
||||
<button class="btn secondary" onclick="refreshVoiceIdStatus()" style="padding:6px 14px;font-size:12px;">
|
||||
🔄 Status aktualisieren
|
||||
</button>
|
||||
<button class="btn danger" onclick="deleteVoiceId()" style="padding:6px 14px;font-size:12px;">
|
||||
🗑 Fingerprint löschen
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Runtime-Konfiguration -->
|
||||
<div class="settings-section">
|
||||
<h2>Runtime-Konfiguration</h2>
|
||||
@@ -1475,6 +1511,46 @@
|
||||
setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw);
|
||||
setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch);
|
||||
setIfPresent('diag-flux-hf-token', msg.huggingfaceToken);
|
||||
// Voice-ID-Threshold wiederherstellen (Default 0.50)
|
||||
if (msg.voiceIdThreshold !== undefined && msg.voiceIdThreshold !== null) {
|
||||
const slider = document.getElementById('diag-voice-id-threshold');
|
||||
const display = document.getElementById('voice-id-threshold-display');
|
||||
if (slider) slider.value = msg.voiceIdThreshold;
|
||||
if (display) display.textContent = Number(msg.voiceIdThreshold).toFixed(2);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg.type === 'voice_id_status_response') {
|
||||
const el = document.getElementById('voice-id-status');
|
||||
if (!el) return;
|
||||
if (msg.payload && msg.payload.ok === false) {
|
||||
el.innerHTML = '<span style="color:#FF6E6E;">⚠ Whisper-Bridge nicht erreichbar: ' +
|
||||
(msg.payload.error || 'unbekannt') + '</span>';
|
||||
return;
|
||||
}
|
||||
const p = msg.payload || msg;
|
||||
if (p.enrolled) {
|
||||
const when = p.updated_at ? new Date(p.updated_at * 1000).toLocaleString('de-DE') : '?';
|
||||
const totalSec = (p.sample_durations_s || []).reduce((a, b) => a + b, 0);
|
||||
el.innerHTML = '<span style="color:#34C759;">✓ Enrolled</span> · ' +
|
||||
p.sample_count + ' Samples (' + totalSec.toFixed(1) + 's) · ' +
|
||||
'aktualisiert ' + when + ' · dim=' + (p.embedding_dim || '?');
|
||||
} else {
|
||||
el.innerHTML = '<span style="color:#FFD60A;">○ Nicht enrolled</span> — ' +
|
||||
'in der App unter "Stimme einrichten" 5-10× je 3s aufnehmen.';
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg.type === 'voice_id_delete_response') {
|
||||
const p = msg.payload || msg;
|
||||
if (p.removed) {
|
||||
alert('Fingerprint gelöscht — Voice-ID-Gating fällt zurück auf Fail-Open.');
|
||||
} else {
|
||||
alert('Es war kein Fingerprint vorhanden.');
|
||||
}
|
||||
refreshVoiceIdStatus();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2607,6 +2683,17 @@
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Requests the current Voice-ID enrollment status.
 * Shows a "querying" placeholder in #voice-id-status (when that element exists)
 * and sends the `voice_id_status` action via send().
 */
function refreshVoiceIdStatus() {
  const statusNode = document.getElementById('voice-id-status');
  if (statusNode) {
    statusNode.textContent = '⏳ Status wird abgefragt...';
  }
  const request = { action: 'voice_id_status' };
  send(request);
}
|
||||
|
||||
/**
 * Asks the user for confirmation and, if confirmed, sends the
 * `voice_id_delete` action via send(). Does nothing when the dialog is cancelled.
 */
function deleteVoiceId() {
  const question = 'Voice-ID-Fingerprint loeschen?\n\nDanach muss in der App neu enrolled werden.';
  const confirmed = confirm(question);
  if (confirmed) {
    send({ action: 'voice_id_delete' });
  }
}
|
||||
|
||||
function deleteXttsVoice(name) {
|
||||
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
|
||||
send({ action: 'xtts_delete_voice', name });
|
||||
@@ -2823,12 +2910,15 @@
|
||||
const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value;
|
||||
const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value;
|
||||
const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value;
|
||||
const voiceIdThresholdRaw = document.getElementById('diag-voice-id-threshold')?.value;
|
||||
const voiceIdThreshold = voiceIdThresholdRaw ? parseFloat(voiceIdThresholdRaw) : undefined;
|
||||
send({
|
||||
action: 'send_voice_config',
|
||||
ttsEnabled, xttsVoice, whisperModel,
|
||||
f5ttsModel, f5ttsCkptFile, f5ttsVocabFile,
|
||||
f5ttsCfgStrength, f5ttsNfeStep,
|
||||
fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken,
|
||||
voiceIdThreshold,
|
||||
});
|
||||
const statusEl = document.getElementById('voice-status');
|
||||
if (statusEl && xttsVoice) {
|
||||
@@ -3354,6 +3444,7 @@
|
||||
loadRuntimeConfig();
|
||||
loadOnboardingQR();
|
||||
loadOAuthServices();
|
||||
refreshVoiceIdStatus();
|
||||
} else if (tab === 'brain') {
|
||||
loadBrainStatus();
|
||||
loadBrainMemoryList();
|
||||
|
||||
@@ -2367,6 +2367,12 @@ wss.on("connection", (ws) => {
|
||||
if (msg.huggingfaceToken !== undefined) {
|
||||
voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim();
|
||||
}
|
||||
// Voice-ID Match-Threshold (0.30-0.70). Wird von der whisper-bridge
|
||||
// ueber den config-Broadcast aufgenommen — Phase 3 nutzt's beim Gating.
|
||||
if (msg.voiceIdThreshold !== undefined && !isNaN(msg.voiceIdThreshold)) {
|
||||
const t = parseFloat(msg.voiceIdThreshold);
|
||||
if (t >= 0.0 && t <= 1.0) voiceConfig.voiceIdThreshold = t;
|
||||
}
|
||||
try {
|
||||
fs.mkdirSync("/shared/config", { recursive: true });
|
||||
fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2));
|
||||
@@ -2390,6 +2396,15 @@ wss.on("connection", (ws) => {
|
||||
handleGetModel(ws);
|
||||
} else if (msg.action === "set_model") {
|
||||
handleSetModel(ws, msg.model);
|
||||
} else if (msg.action === "voice_id_status") {
|
||||
// An whisper-bridge weiterleiten + Antwort an Browser zurueck
|
||||
const reqId = `vid_${Date.now().toString(36)}`;
|
||||
sendToRVS_withResponse("voice_id_status_request", { requestId: reqId },
|
||||
"voice_id_status_response", ws);
|
||||
} else if (msg.action === "voice_id_delete") {
|
||||
const reqId = `viddel_${Date.now().toString(36)}`;
|
||||
sendToRVS_withResponse("voice_id_delete_request", { requestId: reqId },
|
||||
"voice_id_delete_response", ws);
|
||||
}
|
||||
// get_openclaw_config entfernt — aria-core ist raus.
|
||||
} catch {}
|
||||
|
||||
Reference in New Issue
Block a user