fix: Speech gate - only send recording if actual speech detected
- VAD_SPEECH_THRESHOLD_DB = -35 dB (louder than the -45 dB silence threshold)
- Requires 300 ms of continuous speech before it counts as real speech
- Recording is discarded if only background noise was detected
- Prevents sending garbage to Whisper in conversation mode

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,8 @@ const AUDIO_ENCODING = 'audio/wav';
|
|||||||
// VAD (Voice Activity Detection) — Stille-Erkennung
|
// VAD (Voice Activity Detection) — Stille-Erkennung
|
||||||
const VAD_SILENCE_THRESHOLD_DB = -45; // dB unter dem als "Stille" gilt
|
const VAD_SILENCE_THRESHOLD_DB = -45; // dB unter dem als "Stille" gilt
|
||||||
const VAD_SILENCE_DURATION_MS = 1800; // ms Stille bevor Auto-Stop
|
const VAD_SILENCE_DURATION_MS = 1800; // ms Stille bevor Auto-Stop
|
||||||
|
const VAD_SPEECH_THRESHOLD_DB = -35; // dB ueber dem als "Sprache" gilt (Sprach-Gate)
|
||||||
|
const VAD_SPEECH_MIN_MS = 300; // ms Sprache bevor Aufnahme zaehlt
|
||||||
|
|
||||||
// --- Audio-Service ---
|
// --- Audio-Service ---
|
||||||
|
|
||||||
@@ -61,6 +63,10 @@ class AudioService {
|
|||||||
private preloadedSound: Sound | null = null;
|
private preloadedSound: Sound | null = null;
|
||||||
private preloadedPath: string = '';
|
private preloadedPath: string = '';
|
||||||
|
|
||||||
|
// Sprach-Gate: Aufnahme erst senden wenn tatsaechlich gesprochen wurde
|
||||||
|
private speechDetected: boolean = false;
|
||||||
|
private speechStartTime: number = 0;
|
||||||
|
|
||||||
// VAD State
|
// VAD State
|
||||||
private vadEnabled: boolean = false;
|
private vadEnabled: boolean = false;
|
||||||
private lastSpeechTime: number = 0;
|
private lastSpeechTime: number = 0;
|
||||||
@@ -128,7 +134,21 @@ class AudioService {
|
|||||||
const db = e.currentMetering ?? -160;
|
const db = e.currentMetering ?? -160;
|
||||||
this.meterListeners.forEach(cb => cb(db));
|
this.meterListeners.forEach(cb => cb(db));
|
||||||
|
|
||||||
// VAD: Stille erkennen
|
// Sprach-Gate: Erkennen ob tatsaechlich gesprochen wird
|
||||||
|
if (db > VAD_SPEECH_THRESHOLD_DB) {
|
||||||
|
if (!this.speechDetected && this.speechStartTime === 0) {
|
||||||
|
this.speechStartTime = Date.now();
|
||||||
|
}
|
||||||
|
if (this.speechStartTime > 0 && Date.now() - this.speechStartTime >= VAD_SPEECH_MIN_MS) {
|
||||||
|
this.speechDetected = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!this.speechDetected) {
|
||||||
|
this.speechStartTime = 0; // Reset wenn noch nicht als Sprache erkannt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// VAD: Stille erkennen (nur wenn Sprache erkannt wurde)
|
||||||
if (this.vadEnabled) {
|
if (this.vadEnabled) {
|
||||||
if (db > VAD_SILENCE_THRESHOLD_DB) {
|
if (db > VAD_SILENCE_THRESHOLD_DB) {
|
||||||
this.lastSpeechTime = Date.now();
|
this.lastSpeechTime = Date.now();
|
||||||
@@ -138,6 +158,8 @@ class AudioService {
|
|||||||
|
|
||||||
this.recordingStartTime = Date.now();
|
this.recordingStartTime = Date.now();
|
||||||
this.lastSpeechTime = Date.now();
|
this.lastSpeechTime = Date.now();
|
||||||
|
this.speechDetected = false;
|
||||||
|
this.speechStartTime = 0;
|
||||||
this.setState('recording');
|
this.setState('recording');
|
||||||
|
|
||||||
// VAD aktivieren
|
// VAD aktivieren
|
||||||
@@ -180,6 +202,15 @@ class AudioService {
|
|||||||
this.recorder.removeRecordBackListener();
|
this.recorder.removeRecordBackListener();
|
||||||
|
|
||||||
const durationMs = Date.now() - this.recordingStartTime;
|
const durationMs = Date.now() - this.recordingStartTime;
|
||||||
|
const hadSpeech = this.speechDetected;
|
||||||
|
|
||||||
|
// Sprach-Gate: Wenn keine Sprache erkannt → Aufnahme verwerfen
|
||||||
|
if (!hadSpeech) {
|
||||||
|
RNFS.unlink(this.recordingPath).catch(() => {});
|
||||||
|
this.setState('idle');
|
||||||
|
console.log('[Audio] Aufnahme verworfen — keine Sprache erkannt (nur Umgebungsgeraeusche)');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
// Audio-Datei als Base64 lesen
|
// Audio-Datei als Base64 lesen
|
||||||
const base64Data = await RNFS.readFile(this.recordingPath, 'base64');
|
const base64Data = await RNFS.readFile(this.recordingPath, 'base64');
|
||||||
@@ -188,7 +219,7 @@ class AudioService {
|
|||||||
RNFS.unlink(this.recordingPath).catch(() => {});
|
RNFS.unlink(this.recordingPath).catch(() => {});
|
||||||
|
|
||||||
this.setState('idle');
|
this.setState('idle');
|
||||||
console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB)`);
|
console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB, Sprache erkannt)`);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
base64: base64Data,
|
base64: base64Data,
|
||||||
|
|||||||
Reference in New Issue
Block a user