From e3fe27f73677ad6b025be88496a87e124cccbdc1 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Sat, 6 Jun 2026 20:36:06 +0200 Subject: [PATCH] =?UTF-8?q?feat(speaker-id):=20Phase=202=20=E2=80=94=20Enr?= =?UTF-8?q?ollment-UI=20(App)=20+=20Voice-ID-Section=20(Diagnostic)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit App-Seite: - VoiceIdEnrollment.tsx (neue Komponente, ~430 Zeilen): Status-Karte (loading/unenrolled/enrolled/error), Sample-Recorder mit Countdown (4s fest pro Sample), Liste mit einzelnem Loeschen, Save-Button (disabled bis 5 Samples), Fingerprint-Delete mit Confirm. - SettingsScreen.tsx: neue Section 🎤 'Stimme einrichten' zwischen Wake-Word und Sprachausgabe. - Sample-Format: WAV via audioService.startRecording — wird whisper-bridge-seitig per wave-Modul gestrippt. Diagnostic-Seite: - Neue settings-section 'Voice-ID (Sprecher-Erkennung)': Status-Anzeige (live ueber voice_id_status_response), Threshold-Slider 0.30-0.70 (persistiert in voice_config.json, broadcast als config-Message), Refresh + Delete-Button. - server.js: 2 neue actions (voice_id_status, voice_id_delete), send_voice_config nimmt voiceIdThreshold mit auf. Backend: - speaker_id.py: _normalize_audio_bytes erkennt jetzt WAV-Header (RIFF/WAVE) und strippt auf rohes PCM — sonst wuerde der 44-Byte-Header als Audio in die ECAPA-Embeddings einfliessen. - bridge.py: config-Broadcast-Handler uebernimmt voiceIdThreshold nach speaker_id.DEFAULT_THRESHOLD (wird erst in Phase 3 beim Gating genutzt, persistiert aber schon).
Co-Authored-By: Claude Opus 4.7 --- android/src/components/VoiceIdEnrollment.tsx | 426 +++++++++++++++++++ android/src/screens/SettingsScreen.tsx | 8 + diagnostic/index.html | 91 ++++ diagnostic/server.js | 15 + xtts/whisper/bridge.py | 11 + xtts/whisper/speaker_id.py | 29 +- 6 files changed, 578 insertions(+), 2 deletions(-) create mode 100644 android/src/components/VoiceIdEnrollment.tsx diff --git a/android/src/components/VoiceIdEnrollment.tsx b/android/src/components/VoiceIdEnrollment.tsx new file mode 100644 index 0000000..bb5d33f --- /dev/null +++ b/android/src/components/VoiceIdEnrollment.tsx @@ -0,0 +1,426 @@ +/** + * Voice-ID Enrollment + Status — App-seitig. + * + * User nimmt 5-7 Samples (je 4s) seiner Stimme auf, App schickt sie an + * die whisper-bridge via RVS (voice_id_enroll_request). Bridge berechnet + * SpeechBrain-ECAPA-Embeddings, mittelt sie zu einem Fingerprint, speichert + * /voice-id/fingerprint.json. + * + * Verwendung: in SettingsScreen für Section 'voice_id' eingebunden. + * Holt Status bei Mount + nach jedem Enroll/Delete neu ab. 
+ */ + +import React, { useCallback, useEffect, useState } from 'react'; +import { + ActivityIndicator, + Alert, + ScrollView, + StyleSheet, + Text, + ToastAndroid, + TouchableOpacity, + View, +} from 'react-native'; + +import audioService from '../services/audio'; +import rvs from '../services/rvs'; + +const SAMPLE_DURATION_MS = 4000; // Pro Sample 4s aufnehmen +const SAMPLES_REQUIRED = 5; // Mindest-Sampleanzahl fuer Save + +type Sample = { + base64: string; + durationMs: number; +}; + +type Status = + | { state: 'loading' } + | { state: 'unenrolled' } + | { state: 'enrolled'; sampleCount: number; durations: number[]; updatedAt: number; dim: number } + | { state: 'error'; message: string }; + +function _newReqId(prefix: string): string { + return `${prefix}_${Date.now().toString(36)}_${Math.floor(Math.random() * 1e6).toString(36)}`; +} + +export const VoiceIdEnrollment: React.FC = () => { + const [status, setStatus] = useState({ state: 'loading' }); + const [samples, setSamples] = useState([]); + const [recording, setRecording] = useState(false); + const [recordCountdown, setRecordCountdown] = useState(0); + const [enrollPending, setEnrollPending] = useState(false); + const [pendingReqId, setPendingReqId] = useState(null); + + // Status laden + const refreshStatus = useCallback(() => { + setStatus({ state: 'loading' }); + const reqId = _newReqId('vid'); + setPendingReqId(reqId); + rvs.send('voice_id_status_request' as any, { requestId: reqId }); + }, []); + + useEffect(() => { + refreshStatus(); + }, [refreshStatus]); + + // RVS-Antworten verarbeiten + useEffect(() => { + const unsub = rvs.onMessage((msg: any) => { + if (!msg) return; + const p = msg.payload || {}; + if (msg.type === 'voice_id_status_response') { + if (p.ok === false) { + setStatus({ state: 'error', message: p.error || 'Whisper-Bridge nicht erreichbar' }); + return; + } + if (p.enrolled) { + setStatus({ + state: 'enrolled', + sampleCount: p.sample_count || 0, + durations: p.sample_durations_s || 
[], + updatedAt: p.updated_at || 0, + dim: p.embedding_dim || 0, + }); + } else { + setStatus({ state: 'unenrolled' }); + } + } else if (msg.type === 'voice_id_enroll_response') { + setEnrollPending(false); + if (p.ok === false) { + Alert.alert('Enrollment fehlgeschlagen', p.error || 'Unbekannter Fehler'); + return; + } + const rejected = (p.rejected || []).length; + ToastAndroid.show( + `✓ Stimme gespeichert (${p.sample_count} Samples${rejected ? `, ${rejected} verworfen` : ''})`, + ToastAndroid.LONG, + ); + setSamples([]); + refreshStatus(); + } else if (msg.type === 'voice_id_delete_response') { + ToastAndroid.show(p.removed ? '✓ Stimme gelöscht' : 'Es war keine gespeichert', ToastAndroid.SHORT); + refreshStatus(); + } + }); + return () => unsub(); + }, [refreshStatus]); + + // Ein Sample aufnehmen — fest 4s, dann auto-stop + const recordSample = useCallback(async () => { + if (recording || enrollPending) return; + setRecording(true); + setRecordCountdown(SAMPLE_DURATION_MS / 1000); + try { + const ok = await audioService.startRecording(false); + if (!ok) { + ToastAndroid.show('Aufnahme konnte nicht gestartet werden', ToastAndroid.LONG); + setRecording(false); + setRecordCountdown(0); + return; + } + // Countdown-Timer (rein UI) + const tickInterval = setInterval(() => { + setRecordCountdown(c => Math.max(0, c - 1)); + }, 1000); + // Auto-Stop nach festen 4s + await new Promise(r => setTimeout(r, SAMPLE_DURATION_MS)); + clearInterval(tickInterval); + const result = await audioService.stopRecording(); + setRecordCountdown(0); + setRecording(false); + if (!result || !result.base64) { + ToastAndroid.show('Aufnahme leer — nochmal probieren', ToastAndroid.LONG); + return; + } + setSamples(prev => [...prev, { base64: result.base64, durationMs: result.durationMs }]); + } catch (err: any) { + console.warn('[VoiceId] recordSample:', err); + try { await audioService.cancelRecording(); } catch {} + setRecording(false); + setRecordCountdown(0); + 
ToastAndroid.show('Aufnahmefehler: ' + (err?.message || err), ToastAndroid.LONG); + } + }, [recording, enrollPending]); + + const removeSample = useCallback((idx: number) => { + setSamples(prev => prev.filter((_, i) => i !== idx)); + }, []); + + const sendEnrollment = useCallback(() => { + if (samples.length < SAMPLES_REQUIRED) { + Alert.alert('Noch nicht genug', + `Bitte mindestens ${SAMPLES_REQUIRED} Samples aufnehmen — aktuell ${samples.length}.`); + return; + } + if (enrollPending) return; + setEnrollPending(true); + const reqId = _newReqId('videnroll'); + rvs.send('voice_id_enroll_request' as any, { + requestId: reqId, + samples: samples.map(s => s.base64), + }); + // Sicherheits-Timeout: wenn nach 60s nichts kommt, freigeben + setTimeout(() => { + setEnrollPending(prev => { + if (prev) { + ToastAndroid.show('Enrollment-Timeout — bitte erneut versuchen', ToastAndroid.LONG); + } + return false; + }); + }, 60_000); + }, [samples, enrollPending]); + + const deleteFingerprint = useCallback(() => { + Alert.alert( + 'Stimme löschen?', + 'Danach muss ARIA neu enrolled werden, sonst greift Speaker-ID-Filter nicht.', + [ + { text: 'Abbrechen', style: 'cancel' }, + { + text: 'Löschen', style: 'destructive', onPress: () => { + const reqId = _newReqId('viddel'); + rvs.send('voice_id_delete_request' as any, { requestId: reqId }); + }, + }, + ], + ); + }, []); + + // ── Render ────────────────────────────────────────────── + + return ( + + + ARIA erkennt deine Stimme an einem Fingerprint (SpeechBrain ECAPA-TDNN, 192 Dimensionen). + Andere Sprecher (TV, Hintergrund, andere Personen) werden gefiltert — keine Brain-Calls, + keine Tokens. {'\n\n'} + Sprich {SAMPLES_REQUIRED} Mal je {SAMPLE_DURATION_MS / 1000}s ganz normal — verschiedene + Sätze, ruhige Umgebung empfohlen. + + + {/* Status-Karte */} + + Status + {status.state === 'loading' && ( + + + Wird abgefragt... 
+ + )} + {status.state === 'unenrolled' && ( + ○ Nicht enrolled — Stimme einrichten ↓ + )} + {status.state === 'enrolled' && ( + <> + + ✓ Enrolled — {status.sampleCount} Samples + ({status.durations.reduce((a, b) => a + b, 0).toFixed(1)}s gesamt) + + + Aktualisiert {new Date(status.updatedAt * 1000).toLocaleString('de-DE')} · dim={status.dim} + + + )} + {status.state === 'error' && ( + ⚠ {status.message} + )} + + + {/* Aufnahme-Bereich */} + + Samples ({samples.length}/{SAMPLES_REQUIRED}) + {samples.length === 0 && !recording && ( + Tipp: sprich klare normale Sätze, je 3-4 Sekunden Audio. + )} + {samples.map((sample, idx) => ( + + + Sample {idx + 1} · {(sample.durationMs / 1000).toFixed(1)}s + + removeSample(idx)} disabled={enrollPending}> + + + + ))} + + + {recording ? ( + <> + + Aufnahme läuft… {recordCountdown}s + + ) : ( + ⏺ Sample {samples.length + 1} aufnehmen + )} + + + {samples.length > 0 && !recording && ( + setSamples([])} + disabled={enrollPending} + style={s.resetBtn} + > + Alle verwerfen + + )} + + + {/* Aktionen */} + + + {enrollPending ? 
( + <> + + Wird verarbeitet… + + ) : ( + + ✓ Speichern ({samples.length}/{SAMPLES_REQUIRED}) + + )} + + + + {/* Verwaltung */} + {status.state === 'enrolled' && ( + + Verwaltung + + 🔄 Status aktualisieren + + + 🗑 Fingerprint löschen (Re-Enrollment nötig) + + + )} + + ); +}; + +const s = StyleSheet.create({ + intro: { + color: '#8888AA', + fontSize: 13, + lineHeight: 19, + marginBottom: 16, + paddingHorizontal: 4, + }, + card: { + backgroundColor: 'rgba(30,30,46,0.6)', + borderRadius: 8, + padding: 14, + marginBottom: 10, + }, + cardLabel: { + color: '#8888AA', + fontSize: 11, + fontWeight: '700', + textTransform: 'uppercase', + letterSpacing: 0.5, + marginBottom: 8, + }, + statusText: { + color: '#E0E0F0', + fontSize: 14, + fontWeight: '600', + }, + statusSub: { + color: '#555570', + fontSize: 11, + marginTop: 4, + }, + hint: { + color: '#555570', + fontSize: 12, + fontStyle: 'italic', + marginBottom: 8, + }, + sampleRow: { + flexDirection: 'row', + justifyContent: 'space-between', + alignItems: 'center', + paddingVertical: 6, + borderBottomWidth: 1, + borderColor: '#2A2A3E', + }, + sampleText: { + color: '#E0E0F0', + fontSize: 13, + }, + recordBtn: { + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + gap: 8, + backgroundColor: '#E55C5C', + borderRadius: 8, + paddingVertical: 14, + marginTop: 12, + }, + recordBtnText: { + color: '#fff', + fontSize: 15, + fontWeight: '700', + }, + resetBtn: { + alignItems: 'center', + paddingVertical: 8, + marginTop: 6, + }, + resetBtnText: { + color: '#FFD60A', + fontSize: 12, + }, + primaryBtn: { + flex: 1, + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + gap: 8, + backgroundColor: '#34C759', + borderRadius: 8, + paddingVertical: 14, + }, + primaryBtnText: { + color: '#fff', + fontSize: 15, + fontWeight: '700', + }, + secondaryBtn: { + backgroundColor: 'rgba(0,150,255,0.15)', + borderRadius: 6, + paddingVertical: 10, + alignItems: 'center', + marginTop: 6, + }, + 
secondaryBtnText: { + color: '#0096FF', + fontSize: 13, + fontWeight: '600', + }, + dangerBtn: { + backgroundColor: 'rgba(229,92,92,0.15)', + borderRadius: 6, + paddingVertical: 10, + alignItems: 'center', + marginTop: 6, + }, + dangerBtnText: { + color: '#E55C5C', + fontSize: 13, + fontWeight: '600', + }, +}); + +export default VoiceIdEnrollment; diff --git a/android/src/screens/SettingsScreen.tsx b/android/src/screens/SettingsScreen.tsx index 62e2731..8d354e8 100644 --- a/android/src/screens/SettingsScreen.tsx +++ b/android/src/screens/SettingsScreen.tsx @@ -91,6 +91,7 @@ import MemoryBrowser from '../components/MemoryBrowser'; import TriggerBrowser from '../components/TriggerBrowser'; import SkillBrowser from '../components/SkillBrowser'; import OAuthBrowser from '../components/OAuthBrowser'; +import VoiceIdEnrollment from '../components/VoiceIdEnrollment'; import { isVerboseLogging, setVerboseLogging, isDebugLogsToBridge, setDebugLogsToBridge, APP_LOG_EVENT } from '../services/logger'; import { isWakeReadySoundEnabled, @@ -136,6 +137,7 @@ const SETTINGS_SECTIONS = [ { id: 'general', icon: '⚙️', label: 'Allgemein', desc: 'Betriebsmodus, GPS-Standort' }, { id: 'voice_input', icon: '🎙️', label: 'Spracheingabe', desc: 'Stille-Toleranz, Aufnahmedauer' }, { id: 'wake_word', icon: '👂', label: 'Wake-Word', desc: 'Wake-Word-Auswahl' }, + { id: 'voice_id', icon: '🎤', label: 'Stimme einrichten', desc: 'Sprecher-Erkennung — nur deine Stimme triggert ARIA' }, { id: 'voice_output', icon: '🔊', label: 'Sprachausgabe', desc: 'Stimmen, Pre-Roll, Geschwindigkeit' }, { id: 'storage', icon: '📁', label: 'Speicher', desc: 'Anhang-Speicherort, Auto-Download' }, { id: 'files', icon: '📂', label: 'Dateien', desc: 'ARIA- und User-Dateien — anzeigen, löschen' }, @@ -1836,6 +1838,12 @@ const SettingsScreen: React.FC = () => { )} + {/* === Voice-ID Enrollment (Sprecher-Erkennung) === */} + {currentSection === 'voice_id' && (<> + Stimme einrichten + + )} + {/* === Sprachausgabe (geraetelokal) 
=== */} {currentSection === 'voice_output' && (<> Sprachausgabe diff --git a/diagnostic/index.html b/diagnostic/index.html index 95078a0..3d91a89 100644 --- a/diagnostic/index.html +++ b/diagnostic/index.html @@ -764,6 +764,42 @@ + +
+

Voice-ID (Sprecher-Erkennung)

+
+ ARIA erkennt Stefans Stimme anhand eines Fingerprints (SpeechBrain ECAPA-TDNN). + Andere Sprecher (TV, Hintergrund-Gespraeche) werden gefiltert — keine + Brain-Calls, keine Tokens. Enrollment passiert in der App (Settings → Stimme einrichten), + weil das Handy-Mikro auch im Betrieb hoert.
+
+
+ Status wird geladen... +
+
+ + + 0.50 +
+
+ Niedriger = mehr Treffer auch bei Nebengeraeuschen (false-positives). + Hoeher = strenger, kann Stefan auch mal verpassen. 0.50 ist konservativer Default. +
+
+ + +
+
+
+

Runtime-Konfiguration

@@ -1475,6 +1511,46 @@ setIfPresent('diag-flux-keyword-raw', msg.fluxKeywordRaw); setIfPresent('diag-flux-keyword-switch', msg.fluxKeywordSwitch); setIfPresent('diag-flux-hf-token', msg.huggingfaceToken); + // Voice-ID-Threshold wiederherstellen (Default 0.50) + if (msg.voiceIdThreshold !== undefined && msg.voiceIdThreshold !== null) { + const slider = document.getElementById('diag-voice-id-threshold'); + const display = document.getElementById('voice-id-threshold-display'); + if (slider) slider.value = msg.voiceIdThreshold; + if (display) display.textContent = Number(msg.voiceIdThreshold).toFixed(2); + } + return; + } + + if (msg.type === 'voice_id_status_response') { + const el = document.getElementById('voice-id-status'); + if (!el) return; + if (msg.payload && msg.payload.ok === false) { + el.innerHTML = '⚠ Whisper-Bridge nicht erreichbar: ' + + (msg.payload.error || 'unbekannt') + ''; + return; + } + const p = msg.payload || msg; + if (p.enrolled) { + const when = p.updated_at ? new Date(p.updated_at * 1000).toLocaleString('de-DE') : '?'; + const totalSec = (p.sample_durations_s || []).reduce((a, b) => a + b, 0); + el.innerHTML = '✓ Enrolled · ' + + p.sample_count + ' Samples (' + totalSec.toFixed(1) + 's) · ' + + 'aktualisiert ' + when + ' · dim=' + (p.embedding_dim || '?'); + } else { + el.innerHTML = '○ Nicht enrolled — ' + + 'in der App unter "Stimme einrichten" 5-10× je 3s aufnehmen.'; + } + return; + } + + if (msg.type === 'voice_id_delete_response') { + const p = msg.payload || msg; + if (p.removed) { + alert('Fingerprint gelöscht — Voice-ID-Gating fällt zurück auf Fail-Open.'); + } else { + alert('Es war kein Fingerprint vorhanden.'); + } + refreshVoiceIdStatus(); return; } @@ -2607,6 +2683,17 @@ }); } + function refreshVoiceIdStatus() { + const el = document.getElementById('voice-id-status'); + if (el) el.textContent = '⏳ Status wird abgefragt...'; + send({ action: 'voice_id_status' }); + } + + function deleteVoiceId() { + if 
(!confirm('Voice-ID-Fingerprint loeschen?\n\nDanach muss in der App neu enrolled werden.')) return; + send({ action: 'voice_id_delete' }); + } + function deleteXttsVoice(name) { if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return; send({ action: 'xtts_delete_voice', name }); @@ -2823,12 +2910,15 @@ const fluxKeywordRaw = document.getElementById('diag-flux-keyword-raw')?.value; const fluxKeywordSwitch = document.getElementById('diag-flux-keyword-switch')?.value; const huggingfaceToken = document.getElementById('diag-flux-hf-token')?.value; + const voiceIdThresholdRaw = document.getElementById('diag-voice-id-threshold')?.value; + const voiceIdThreshold = voiceIdThresholdRaw ? parseFloat(voiceIdThresholdRaw) : undefined; send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel, f5ttsModel, f5ttsCkptFile, f5ttsVocabFile, f5ttsCfgStrength, f5ttsNfeStep, fluxDefaultModel, fluxKeywordRaw, fluxKeywordSwitch, huggingfaceToken, + voiceIdThreshold, }); const statusEl = document.getElementById('voice-status'); if (statusEl && xttsVoice) { @@ -3354,6 +3444,7 @@ loadRuntimeConfig(); loadOnboardingQR(); loadOAuthServices(); + refreshVoiceIdStatus(); } else if (tab === 'brain') { loadBrainStatus(); loadBrainMemoryList(); diff --git a/diagnostic/server.js b/diagnostic/server.js index 1a62eb4..0885759 100644 --- a/diagnostic/server.js +++ b/diagnostic/server.js @@ -2367,6 +2367,12 @@ wss.on("connection", (ws) => { if (msg.huggingfaceToken !== undefined) { voiceConfig.huggingfaceToken = String(msg.huggingfaceToken || "").trim(); } + // Voice-ID Match-Threshold (0.30-0.70). Wird von der whisper-bridge + // ueber den config-Broadcast aufgenommen — Phase 3 nutzt's beim Gating. 
+ if (msg.voiceIdThreshold !== undefined && !isNaN(msg.voiceIdThreshold)) { + const t = parseFloat(msg.voiceIdThreshold); + if (t >= 0.0 && t <= 1.0) voiceConfig.voiceIdThreshold = t; + } try { fs.mkdirSync("/shared/config", { recursive: true }); fs.writeFileSync("/shared/config/voice_config.json", JSON.stringify(voiceConfig, null, 2)); @@ -2390,6 +2396,15 @@ wss.on("connection", (ws) => { handleGetModel(ws); } else if (msg.action === "set_model") { handleSetModel(ws, msg.model); + } else if (msg.action === "voice_id_status") { + // An whisper-bridge weiterleiten + Antwort an Browser zurueck + const reqId = `vid_${Date.now().toString(36)}`; + sendToRVS_withResponse("voice_id_status_request", { requestId: reqId }, + "voice_id_status_response", ws); + } else if (msg.action === "voice_id_delete") { + const reqId = `viddel_${Date.now().toString(36)}`; + sendToRVS_withResponse("voice_id_delete_request", { requestId: reqId }, + "voice_id_delete_response", ws); } // get_openclaw_config entfernt — aria-core ist raus. } catch {} diff --git a/xtts/whisper/bridge.py b/xtts/whisper/bridge.py index e79e281..d1be158 100644 --- a/xtts/whisper/bridge.py +++ b/xtts/whisper/bridge.py @@ -781,6 +781,17 @@ async def run_loop(runner: WhisperRunner, sessions: SessionManager) -> None: # Debug-Toggle: aria-bridge broadcastet jetzt whisperDebugLog # damit Stefan im laufenden Betrieb via Diagnostic-Settings # die Logs an/aus schalten kann. + # Voice-ID Match-Threshold (von Diagnostic gesendet) auf das + # speaker_id-Modul setzen — wird erst in Phase 3 beim Gating + # genutzt, aber persistiert bereits jetzt. 
+ if "voiceIdThreshold" in payload: + try: + t = float(payload.get("voiceIdThreshold", 0.5)) + if 0.0 <= t <= 1.0: + speaker_id.DEFAULT_THRESHOLD = t + logger.info("[speaker-id] threshold gesetzt: %.2f", t) + except (TypeError, ValueError): + pass if "whisperDebugLog" in payload: global _DEBUG_LOG_TO_BRIDGE old = _DEBUG_LOG_TO_BRIDGE diff --git a/xtts/whisper/speaker_id.py b/xtts/whisper/speaker_id.py index 571ad7e..4799599 100644 --- a/xtts/whisper/speaker_id.py +++ b/xtts/whisper/speaker_id.py @@ -61,10 +61,35 @@ def _ensure_loaded(): return _model +def _normalize_audio_bytes(audio_bytes: bytes) -> bytes: + """Akzeptiert entweder rohes 16kHz int16 LE PCM ODER eine WAV-Datei (RIFF/WAVE). + Bei WAV wird der Header gestrippt + Format validiert (16kHz / mono / int16). + Ergebnis: rohes PCM.""" + if (len(audio_bytes) >= 44 + and audio_bytes[:4] == b"RIFF" + and audio_bytes[8:12] == b"WAVE"): + import io + import wave + with wave.open(io.BytesIO(audio_bytes), "rb") as wav: + sr = wav.getframerate() + ch = wav.getnchannels() + sw = wav.getsampwidth() + if sr != 16000: + raise ValueError(f"WAV-Samplerate {sr} != 16000") + if ch != 1: + raise ValueError(f"WAV-Kanalzahl {ch} != 1 (mono erwartet)") + if sw != 2: + raise ValueError(f"WAV-Sampleweite {sw} != 2 (int16 erwartet)") + return wav.readframes(wav.getnframes()) + return audio_bytes + + def _audio_bytes_to_tensor(audio_bytes: bytes): - """int16 LE PCM (16kHz mono) → Torch-Tensor (1, N), normalisiert auf [-1, 1].""" + """int16 LE PCM (16kHz mono) → Torch-Tensor (1, N), normalisiert auf [-1, 1]. + WAV wird vorher auf rohes PCM reduziert (Header strippen).""" import torch - arr = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 + raw = _normalize_audio_bytes(audio_bytes) + arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 return torch.from_numpy(arr).unsqueeze(0)