fix: Speech gate - only send recording if actual speech detected

- VAD_SPEECH_THRESHOLD_DB = -35 (louder than the silence threshold)
- Needs 300 ms of speech before it counts as real speech
- Recording is discarded if only background noise was detected
- Prevents sending garbage to Whisper in conversation mode

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 18:20:05 +02:00 · 2026-04-11 18:20:05 +02:00 · acc13aef6b
parent 4bbc6f7787
commit acc13aef6b
1 changed files with 33 additions and 2 deletions
--- a/android/src/services/audio.ts
+++ b/android/src/services/audio.ts
@ -42,6 +42,8 @@ const AUDIO_ENCODING = 'audio/wav';
 // VAD (Voice Activity Detection) — Stille-Erkennung
 const VAD_SILENCE_THRESHOLD_DB = -45;  // dB unter dem als "Stille" gilt
 const VAD_SILENCE_DURATION_MS = 1800;  // ms Stille bevor Auto-Stop
+const VAD_SPEECH_THRESHOLD_DB = -35;   // dB ueber dem als "Sprache" gilt (Sprach-Gate)
+const VAD_SPEECH_MIN_MS = 300;         // ms Sprache bevor Aufnahme zaehlt

 // --- Audio-Service ---

@ -61,6 +63,10 @@ class AudioService {
  private preloadedSound: Sound | null = null;
  private preloadedPath: string = '';

+  // Sprach-Gate: Aufnahme erst senden wenn tatsaechlich gesprochen wurde
+  private speechDetected: boolean = false;
+  private speechStartTime: number = 0;
+
  // VAD State
  private vadEnabled: boolean = false;
  private lastSpeechTime: number = 0;
@ -128,7 +134,21 @@ class AudioService {
        const db = e.currentMetering ?? -160;
        this.meterListeners.forEach(cb => cb(db));

-        // VAD: Stille erkennen
+        // Sprach-Gate: Erkennen ob tatsaechlich gesprochen wird
+        if (db > VAD_SPEECH_THRESHOLD_DB) {
+          if (!this.speechDetected && this.speechStartTime === 0) {
+            this.speechStartTime = Date.now();
+          }
+          if (this.speechStartTime > 0 && Date.now() - this.speechStartTime >= VAD_SPEECH_MIN_MS) {
+            this.speechDetected = true;
+          }
+        } else {
+          if (!this.speechDetected) {
+            this.speechStartTime = 0; // Reset wenn noch nicht als Sprache erkannt
+          }
+        }
+
+        // VAD: Stille erkennen (nur wenn Sprache erkannt wurde)
        if (this.vadEnabled) {
          if (db > VAD_SILENCE_THRESHOLD_DB) {
            this.lastSpeechTime = Date.now();
@ -138,6 +158,8 @@ class AudioService {

      this.recordingStartTime = Date.now();
      this.lastSpeechTime = Date.now();
+      this.speechDetected = false;
+      this.speechStartTime = 0;
      this.setState('recording');

      // VAD aktivieren
@ -180,6 +202,15 @@ class AudioService {
      this.recorder.removeRecordBackListener();

      const durationMs = Date.now() - this.recordingStartTime;
+      const hadSpeech = this.speechDetected;
+
+      // Sprach-Gate: Wenn keine Sprache erkannt → Aufnahme verwerfen
+      if (!hadSpeech) {
+        RNFS.unlink(this.recordingPath).catch(() => {});
+        this.setState('idle');
+        console.log('[Audio] Aufnahme verworfen — keine Sprache erkannt (nur Umgebungsgeraeusche)');
+        return null;
+      }

      // Audio-Datei als Base64 lesen
      const base64Data = await RNFS.readFile(this.recordingPath, 'base64');
@ -188,7 +219,7 @@ class AudioService {
      RNFS.unlink(this.recordingPath).catch(() => {});

      this.setState('idle');
-      console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB)`);
+      console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB, Sprache erkannt)`);

      return {
        base64: base64Data,