ARIA-AGENT/android/src/services/audio.ts

/**
 * Audio service for voice input and output.
 *
 * Manages microphone recording (with VAD / auto-stop on silence),
 * TTS audio playback, and metering for visual feedback.
 * Uses react-native-audio-recorder-player for recording.
 */

import { Platform, PermissionsAndroid, NativeModules, ToastAndroid } from 'react-native';
import Sound from 'react-native-sound';
import RNFS from 'react-native-fs';
import AsyncStorage from '@react-native-async-storage/async-storage';
import AudioRecorderPlayer, {
  AudioEncoderAndroidType,
  AudioSourceAndroidType,
  AVEncodingOption,
  OutputFormatAndroidType,
} from 'react-native-audio-recorder-player';

// Base64 alphabet (standard, with '+' and '/') used by btoaSafe.
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

/**
 * Encodes a "binary string" (one character per byte) as padded Base64.
 *
 * Only the low 8 bits of every character code are used, so characters above
 * U+00FF are silently truncated to a single byte.
 *
 * @param bin Binary string, e.g. header bytes turned into characters.
 * @returns The Base64 text, padded with '=' to a multiple of four characters.
 */
function btoaSafe(bin: string): string {
  const groups: string[] = [];
  for (let pos = 0; pos < bin.length; pos += 3) {
    const remaining = bin.length - pos;
    const first = bin.charCodeAt(pos) & 0xff;
    const second = remaining > 1 ? bin.charCodeAt(pos + 1) & 0xff : 0;
    const third = remaining > 2 ? bin.charCodeAt(pos + 2) & 0xff : 0;
    // Pack up to three bytes into one 24-bit value, then cut it into 6-bit digits.
    const triple = (first << 16) | (second << 8) | third;
    groups.push(
      B64_CHARS.charAt((triple >> 18) & 0x3f) +
        B64_CHARS.charAt((triple >> 12) & 0x3f) +
        (remaining > 1 ? B64_CHARS.charAt((triple >> 6) & 0x3f) : '=') +
        (remaining > 2 ? B64_CHARS.charAt(triple & 0x3f) : '='),
    );
  }
  return groups.join('');
}

// Optional native modules, both typed as possibly undefined:
//  - AudioFocus: audio-focus handling (ducking / muting other apps).
//  - PcmStreamPlayer: streaming playback of base64-encoded PCM chunks.
// The call sites visible in this file use optional chaining for AudioFocus and
// check PcmStreamPlayer for presence before using it.
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
  AudioFocus?: {
    requestDuck: () => Promise<boolean>;
    requestExclusive: () => Promise<boolean>;
    release: () => Promise<boolean>;
  };
  PcmStreamPlayer?: {
    start: (sampleRate: number, channels: number, prerollSeconds: number) => Promise<boolean>;
    writeChunk: (base64Pcm: string) => Promise<boolean>;
    end: () => Promise<boolean>;
    stop: () => Promise<boolean>;
  };
};

// --- Types ---

/** Result of a finished microphone recording. */
export interface RecordingResult {
  /** Base64-encoded audio data */
  base64: string;
  /** Duration in milliseconds */
  durationMs: number;
  /** MIME type of the data (stopRecording in this file reports 'audio/mp4') */
  mimeType: string;
}

/** Lifecycle of the recorder: nothing running, capturing, or finishing up after a stop. */
export type RecordingState = 'idle' | 'recording' | 'processing';

// Listener signatures used by the service's listener arrays.
type RecordingStateCallback = (state: RecordingState) => void;
type MeterCallback = (db: number) => void;
type SilenceCallback = () => void;

// --- Constants ---

// NOTE(review): none of the three AUDIO_* constants is referenced in the part
// of this file reviewed here — startRecording passes 16000 / 1 as literals and
// stopRecording reports 'audio/mp4'. Confirm whether they are still needed.
const AUDIO_SAMPLE_RATE = 16000;
const AUDIO_CHANNELS = 1;
const AUDIO_ENCODING = 'audio/wav';

// VAD (voice activity detection) — silence detection.
// Fallback values in case the adaptive baseline measurement fails (e.g. because
// the microphone delivers no metering updates). Adaptive values are measured at
// runtime from the first VAD_BASELINE_SAMPLES samples and set to
// baseline + offset — this works in loud as well as quiet environments.
const VAD_SILENCE_FALLBACK_DB = -38;   // fallback silence threshold
const VAD_SPEECH_FALLBACK_DB = -22;    // fallback speech threshold
const VAD_SILENCE_OFFSET_DB = 6;       // silence threshold = baseline + 6 dB
const VAD_SPEECH_OFFSET_DB = 12;       // confident speech = baseline + 12 dB
const VAD_BASELINE_SAMPLES = 5;        // 5 × 100 ms = 500 ms baseline
const VAD_SPEECH_MIN_MS = 500;         // ms of speech before a recording counts — longer filters out coughs/knocks

// VAD silence (in seconds) — how long a pause in speech is tolerated before
// the recording is ended automatically. Adjustable in the app settings.
export const VAD_SILENCE_DEFAULT_SEC = 2.8;
export const VAD_SILENCE_MIN_SEC = 1.0;
export const VAD_SILENCE_MAX_SEC = 8.0;
export const VAD_SILENCE_STORAGE_KEY = 'aria_vad_silence_sec';

// Conversation window (in seconds) — after ARIA's answer the user has this
// long to keep talking in conversation mode without the conversation being
// ended. Saying nothing within the window turns the conversation off.
export const CONV_WINDOW_DEFAULT_SEC = 8.0;
export const CONV_WINDOW_MIN_SEC = 3.0;
export const CONV_WINDOW_MAX_SEC = 20.0;
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';

// TTS playback speed — stored per device and passed on to the bridge
// (speed parameter of the F5-TTS infer()). 1.0 = normal.
export const TTS_SPEED_DEFAULT = 1.0;
export const TTS_SPEED_MIN = 0.1;
export const TTS_SPEED_MAX = 5.0;
export const TTS_SPEED_STORAGE_KEY = 'aria_tts_speed';

/**
 * Reads the stored TTS playback speed.
 *
 * @returns The persisted speed when it parses to a finite number within
 *   [TTS_SPEED_MIN, TTS_SPEED_MAX]; otherwise (missing, malformed, out of range
 *   or storage error) TTS_SPEED_DEFAULT.
 */
export async function loadTtsSpeed(): Promise<number> {
  let speed = TTS_SPEED_DEFAULT;
  try {
    const stored = await AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY);
    const parsed = stored == null ? NaN : parseFloat(stored);
    if (isFinite(parsed) && parsed >= TTS_SPEED_MIN && parsed <= TTS_SPEED_MAX) {
      speed = parsed;
    }
  } catch {
    // Storage not readable → keep the default.
  }
  return speed;
}

/**
 * Reads the stored conversation-window length.
 *
 * @returns The window in milliseconds (rounded). Falls back to
 *   CONV_WINDOW_DEFAULT_SEC when the stored value is missing, not a finite
 *   number, outside [CONV_WINDOW_MIN_SEC, CONV_WINDOW_MAX_SEC], or when the
 *   storage read fails.
 */
export async function loadConvWindowMs(): Promise<number> {
  let seconds = CONV_WINDOW_DEFAULT_SEC;
  try {
    const stored = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY);
    const parsed = stored == null ? NaN : parseFloat(stored);
    const inRange = parsed >= CONV_WINDOW_MIN_SEC && parsed <= CONV_WINDOW_MAX_SEC;
    if (isFinite(parsed) && inRange) {
      seconds = parsed;
    }
  } catch {
    // Storage not readable → keep the default.
  }
  return Math.round(seconds * 1000);
}

/**
 * Reads the stored VAD silence duration.
 *
 * @returns The tolerated pause in milliseconds (rounded). Falls back to
 *   VAD_SILENCE_DEFAULT_SEC when the stored value is missing, not a finite
 *   number, outside [VAD_SILENCE_MIN_SEC, VAD_SILENCE_MAX_SEC], or when the
 *   storage read fails.
 */
async function loadVadSilenceMs(): Promise<number> {
  let seconds = VAD_SILENCE_DEFAULT_SEC;
  try {
    const stored = await AsyncStorage.getItem(VAD_SILENCE_STORAGE_KEY);
    const parsed = stored == null ? NaN : parseFloat(stored);
    const inRange = parsed >= VAD_SILENCE_MIN_SEC && parsed <= VAD_SILENCE_MAX_SEC;
    if (isFinite(parsed) && inRange) {
      seconds = parsed;
    }
  } catch {
    // Storage not readable → keep the default.
  }
  return Math.round(seconds * 1000);
}

// Maximum duration of a single recording (emergency brake against runaway
// loops). The default is 5 minutes (300 s) so that longer explanations still
// fit; configurable in the app settings between 1 and 30 minutes.
export const MAX_RECORDING_DEFAULT_SEC = 300;
export const MAX_RECORDING_MIN_SEC = 60;
export const MAX_RECORDING_MAX_SEC = 1800;
export const MAX_RECORDING_STORAGE_KEY = 'aria_max_recording_sec';

/**
 * Reads the stored maximum recording duration.
 *
 * @returns The limit in milliseconds. A stored value is used (rounded) only if
 *   it parses to a finite number within
 *   [MAX_RECORDING_MIN_SEC, MAX_RECORDING_MAX_SEC]; in every other case,
 *   including a failing storage read, MAX_RECORDING_DEFAULT_SEC applies.
 */
export async function loadMaxRecordingMs(): Promise<number> {
  const fallbackMs = MAX_RECORDING_DEFAULT_SEC * 1000;
  try {
    const stored = await AsyncStorage.getItem(MAX_RECORDING_STORAGE_KEY);
    if (stored == null) {
      return fallbackMs;
    }
    const seconds = parseFloat(stored);
    const tooShort = seconds < MAX_RECORDING_MIN_SEC;
    const tooLong = seconds > MAX_RECORDING_MAX_SEC;
    if (!isFinite(seconds) || tooShort || tooLong) {
      return fallbackMs;
    }
    return Math.round(seconds * 1000);
  } catch {
    return fallbackMs;
  }
}

// Pre-roll: how much audio sits in the AudioTrack buffer before play() starts.
// Adjustable via diagnostics/settings (key: aria_tts_preroll_sec).
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
export const TTS_PREROLL_MIN_SEC = 0;     // 0 = play immediately (F5-TTS is fast enough)
export const TTS_PREROLL_MAX_SEC = 6.0;
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';

/**
 * Reads the stored TTS pre-roll.
 *
 * @returns The pre-roll in seconds when the stored value parses to a finite
 *   number within [TTS_PREROLL_MIN_SEC, TTS_PREROLL_MAX_SEC]; otherwise
 *   (missing, malformed, out of range or storage error) TTS_PREROLL_DEFAULT_SEC.
 */
async function loadPrerollSec(): Promise<number> {
  let preroll = TTS_PREROLL_DEFAULT_SEC;
  try {
    const stored = await AsyncStorage.getItem(TTS_PREROLL_STORAGE_KEY);
    const parsed = stored == null ? NaN : parseFloat(stored);
    if (isFinite(parsed) && parsed >= TTS_PREROLL_MIN_SEC && parsed <= TTS_PREROLL_MAX_SEC) {
      preroll = parsed;
    }
  } catch {
    // Storage not readable → keep the default.
  }
  return preroll;
}

// --- Audio-Service ---

class AudioService {
  // Recorder state and listener registries.
  private recordingState: RecordingState = 'idle';
  private recordingStartTime: number = 0;
  private stateListeners: RecordingStateCallback[] = [];
  private meterListeners: MeterCallback[] = [];
  private silenceListeners: SilenceCallback[] = [];
  private currentSound: Sound | null = null;
  private recorder: AudioRecorderPlayer;
  private recordingPath: string = '';

  // Audio queue for sequential TTS playback (base64 payloads)
  private audioQueue: string[] = [];
  private isPlaying: boolean = false;
  private preloadedSound: Sound | null = null;
  private preloadedPath: string = '';

  // Speech gate: a recording is only sent if speech was actually detected
  private speechDetected: boolean = false;
  private speechStartTime: number = 0;

  // PCM stream (XTTS): active session + cache buffer per messageId
  private pcmStreamActive: boolean = false;
  private pcmMessageId: string = '';
  private pcmSampleRate: number = 24000;
  private pcmChannels: number = 1;
  private pcmBuffer: string[] = []; // base64 chunks for the later WAV build
  private pcmBytesCollected: number = 0;
  private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB

  // AudioFocus is released with a delay — if ARIA sends a second answer right
  // after the first (or a new stream starts), Spotify stays paused. Without
  // this delay Spotify briefly resumes in the tiny gap between two streams.
  private focusReleaseTimer: ReturnType<typeof setTimeout> | null = null;
  private readonly FOCUS_RELEASE_DELAY_MS = 800;

  // Conversation mode: while active (wake-word status 'conversing' OR we know
  // "ARIA is currently speaking in a multi-turn dialog") the AudioFocus is held
  // PERMANENTLY. The per-stream release is suppressed so that Spotify does not
  // come back during render pauses or between answers.
  private _conversationFocusActive: boolean = false;

  // VAD state
  private vadEnabled: boolean = false;
  private lastSpeechTime: number = 0;
  private vadTimer: ReturnType<typeof setInterval> | null = null;
  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
  // Latch so that the silence callback fires exactly once per recording
  private silenceFired: boolean = false;
  private noSpeechTimer: ReturnType<typeof setTimeout> | null = null;
  // Adaptive thresholds — measured from the microphone level during the first
  // 500 ms. baseline = lowest dB value of the first 5 samples (startRecording
  // takes the minimum, not the mean), then, before clamping:
  //   silence  = baseline + VAD_SILENCE_OFFSET_DB  (6 dB above ambient)
  //   speech   = baseline + VAD_SPEECH_OFFSET_DB   (12 dB above ambient = clear speech)
  // Intended to work in a quiet office as well as in a noisy cafe.
  private vadBaselineSamples: number[] = [];
  private vadAdaptiveSilenceDb: number = VAD_SILENCE_FALLBACK_DB;
  private vadAdaptiveSpeechDb: number = VAD_SPEECH_FALLBACK_DB;

  /** Creates the recorder and configures it for a metering update every 100 ms. */
  constructor() {
    const recorder = new AudioRecorderPlayer();
    recorder.setSubscriptionDuration(0.1); // seconds between metering updates
    this.recorder = recorder;
  }

  /**
   * Releases the AudioFocus after FOCUS_RELEASE_DELAY_MS instead of right away.
   * Otherwise Spotify/YouTube briefly resume in the gap between two TTS streams
   * (or when ARIA sends a second answer straight after the first).
   *
   * While conversation focus is active, nothing is scheduled: a pending release
   * is cancelled and the focus stays held. The flag is checked again when the
   * timer fires, so a conversation that started in the meantime also wins.
   */
  private _releaseFocusDeferred(): void {
    // Whatever was scheduled before is superseded in both cases.
    this._cancelDeferredFocusRelease();
    if (this._conversationFocusActive) {
      return;
    }

    const releaseUnlessConversing = (): void => {
      this.focusReleaseTimer = null;
      if (!this._conversationFocusActive) {
        AudioFocus?.release().catch(() => {});
      }
    };
    this.focusReleaseTimer = setTimeout(releaseUnlessConversing, this.FOCUS_RELEASE_DELAY_MS);
  }

  /** Cancels a pending deferred AudioFocus release; does nothing if none is scheduled. */
  private _cancelDeferredFocusRelease(): void {
    const pending = this.focusReleaseTimer;
    if (!pending) {
      return;
    }
    clearTimeout(pending);
    this.focusReleaseTimer = null;
  }

  /**
   * Conversation mode begins: hold the AudioFocus for the whole conversation
   * (Spotify stays paused). Cancels a pending deferred release and requests
   * ducking. Calling it again while already active is a no-op.
   */
  acquireConversationFocus(): void {
    if (!this._conversationFocusActive) {
      this._conversationFocusActive = true;
      this._cancelDeferredFocusRelease();
      console.log('[Audio] Conversation-Focus aktiv (Spotify bleibt gepaust)');
      AudioFocus?.requestDuck().catch(() => {});
    }
  }

  /**
   * Conversation mode ends: the focus may be released again. The release is
   * deferred so that an answer following immediately does not break anything.
   * Does nothing when conversation focus is not active.
   */
  releaseConversationFocus(): void {
    if (this._conversationFocusActive) {
      this._conversationFocusActive = false;
      console.log('[Audio] Conversation-Focus inaktiv');
      this._releaseFocusDeferred();
    }
  }

  /**
   * Hard-stops TTS playback, e.g. when a phone call comes in.
   * Clears the conversation-focus flag and then delegates to stopPlayback().
   *
   * NOTE(review): the original comment promises that the AudioFocus is released
   * immediately so the ringtone is audible. This method does not call
   * AudioFocus.release itself; whether (and how quickly) the focus is released
   * depends on stopPlayback(), which is not shown here — confirm.
   *
   * @param reason Free-text reason, used for logging only.
   */
  haltAllPlayback(reason: string = ''): void {
    const shownReason = reason || '(no reason)';
    console.log('[Audio] haltAllPlayback: %s', shownReason);
    this._conversationFocusActive = false;
    this.stopPlayback();
  }

  /**
   * Tells whether ARIA is currently playing something — WAV queue or PCM stream.
   * Useful for barge-in: when the user speaks while ARIA is speaking, playback
   * should be aborted and the new user message processed instead
   * ("oh forget it, rather do X").
   *
   * @returns true while the WAV queue is playing or a PCM stream is active.
   */
  isPlayingAudio(): boolean {
    if (this.isPlaying) {
      return true;
    }
    return this.pcmStreamActive;
  }

  // --- Berechtigungen ---

  /**
   * Asks for the RECORD_AUDIO permission on Android.
   *
   * @returns true on every non-Android platform without asking; on Android true
   *   only when the request resolves to GRANTED. A failing request is logged
   *   and reported as false.
   */
  async requestMicrophonePermission(): Promise<boolean> {
    if (Platform.OS === 'android') {
      const rationale = {
        title: 'ARIA Cockpit - Mikrofon',
        message: 'ARIA benoetigt Zugriff auf das Mikrofon fuer Spracheingabe.',
        buttonPositive: 'Erlauben',
        buttonNegative: 'Ablehnen',
      };
      try {
        const outcome = await PermissionsAndroid.request(
          PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
          rationale,
        );
        return outcome === PermissionsAndroid.RESULTS.GRANTED;
      } catch (err) {
        console.error('[Audio] Fehler bei Berechtigungsanfrage:', err);
        return false;
      }
    }
    return true;
  }

  // --- Aufnahme ---

  /**
   * Starts a microphone recording with metering.
   *
   * Sequence: permission check → stop running playback → start the native
   * recorder → register the metering listener (adaptive VAD baseline, speech
   * gate, last-speech timestamp) → request exclusive AudioFocus → arm the
   * auto-stop timers.
   *
   * Every auto-stop trigger (VAD silence, max duration, no-speech window) goes
   * through one latch that disables VAD and clears all timers BEFORE the
   * silence listeners are called, so the listeners run at most once per
   * recording and no timer can call into stopRecording in parallel.
   *
   * @param autoStop          Enable VAD — notify silence listeners after the
   *                          configured pause, and after the configured maximum
   *                          duration as an emergency brake.
   * @param noSpeechTimeoutMs If > 0 and no speech was detected within this many
   *                          milliseconds, silence is reported (the speech gate
   *                          in stopRecording then discards the recording).
   *                          Used for the conversation window after ARIA's answer.
   * @returns true if the recorder was started, false if a recording is already
   *          active, the permission is missing, or starting failed.
   */
  async startRecording(autoStop: boolean = false, noSpeechTimeoutMs: number = 0): Promise<boolean> {
    if (this.recordingState !== 'idle') {
      console.warn('[Audio] Aufnahme laeuft bereits');
      return false;
    }

    const hasPermission = await this.requestMicrophonePermission();
    if (!hasPermission) {
      console.warn('[Audio] Keine Mikrofon-Berechtigung');
      return false;
    }

    try {
      // Stop running playback so ARIA does not hear herself.
      this.stopPlayback();

      // Best-effort cleanup of old aria_recording_ / aria_tts_ files
      // (guards against cache overflow after many conversation cycles).
      this._cleanupStaleCacheFiles().catch(() => {});

      // The path doubles as the identity of this recording session (see below).
      const sessionPath = `${RNFS.CachesDirectoryPath}/aria_recording_${Date.now()}.mp4`;
      this.recordingPath = sessionPath;

      // Start recording with metering enabled.
      await this.recorder.startRecorder(sessionPath, {
        AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
        AudioSourceAndroid: AudioSourceAndroidType.MIC,
        OutputFormatAndroid: OutputFormatAndroidType.MPEG_4,
        AudioSamplingRateAndroid: 16000,
        AudioChannelsAndroid: 1,
      }, true); // meteringEnabled = true

      // Metering callback
      this.recorder.addRecordBackListener((e) => {
        const db = e.currentMetering ?? -160;
        this.meterListeners.forEach(cb => cb(db));

        // Adaptive baseline: collect the first samples (~500 ms), then adjust
        // the thresholds. Values at or below -100 (no metering) are ignored,
        // otherwise the baseline would become pointlessly low.
        if (this.vadBaselineSamples.length < VAD_BASELINE_SAMPLES) {
          if (db > -100) {
            this.vadBaselineSamples.push(db);
            if (this.vadBaselineSamples.length === VAD_BASELINE_SAMPLES) {
              // Minimum instead of mean: robust against spike samples (e.g. the
              // user speaks right after the wake word, or the wake-word echo is
              // still in the microphone). The minimum is the quietest moment.
              const lowest = Math.min(...this.vadBaselineSamples);
              const rawSilence = lowest + VAD_SILENCE_OFFSET_DB;
              const rawSpeech = lowest + VAD_SPEECH_OFFSET_DB;
              // Clamp to a sensible range:
              // - silence threshold not above -28 dB (otherwise background
              //   noise permanently counts as "speech" → VAD never fires)
              // - silence threshold not below -50 dB (otherwise too strict)
              // - speech threshold kept within [-40, -18] dB
              this.vadAdaptiveSilenceDb = Math.max(-50, Math.min(rawSilence, -28));
              this.vadAdaptiveSpeechDb = Math.max(-40, Math.min(rawSpeech, -18));
              const msg = `VAD: ambient=${lowest.toFixed(0)}dB stille>${this.vadAdaptiveSilenceDb.toFixed(0)}dB`;
              console.log('[Audio] %s speech>%s (raw silence=%s speech=%s)',
                          msg, this.vadAdaptiveSpeechDb.toFixed(1),
                          rawSilence.toFixed(1), rawSpeech.toFixed(1));
              try { ToastAndroid.show(msg, ToastAndroid.SHORT); } catch {}
            }
          }
        }

        // Speech gate: the level must stay above the speech threshold for
        // VAD_SPEECH_MIN_MS before the recording counts as containing speech.
        if (db > this.vadAdaptiveSpeechDb) {
          if (!this.speechDetected && this.speechStartTime === 0) {
            this.speechStartTime = Date.now();
          }
          if (this.speechStartTime > 0 && Date.now() - this.speechStartTime >= VAD_SPEECH_MIN_MS) {
            this.speechDetected = true;
          }
        } else {
          if (!this.speechDetected) {
            this.speechStartTime = 0; // reset while not yet recognised as speech
          }
        }

        // VAD: every sample above the silence threshold restarts the silence
        // clock that the interval timer below evaluates.
        if (this.vadEnabled) {
          if (db > this.vadAdaptiveSilenceDb) {
            this.lastSpeechTime = Date.now();
          }
        }
      });

      this.recordingStartTime = Date.now();
      this.lastSpeechTime = Date.now();
      this.speechDetected = false;
      this.speechStartTime = 0;
      // Reset the adaptive VAD: the baseline is measured again during the first
      // 500 ms; until then the fallback thresholds apply.
      this.vadBaselineSamples = [];
      this.vadAdaptiveSilenceDb = VAD_SILENCE_FALLBACK_DB;
      this.vadAdaptiveSpeechDb = VAD_SPEECH_FALLBACK_DB;
      this.setState('recording');

      // Pause other apps (music, videos, ...) while recording.
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestExclusive().catch(() => {});

      this.vadEnabled = autoStop;
      this.silenceFired = false;
      const fireSilenceOnce = (reason: string) => {
        if (this.silenceFired) return;
        this.silenceFired = true;
        this.vadEnabled = false;
        if (this.vadTimer) { clearInterval(this.vadTimer); this.vadTimer = null; }
        if (this.maxDurationTimer) { clearTimeout(this.maxDurationTimer); this.maxDurationTimer = null; }
        if (this.noSpeechTimer) { clearTimeout(this.noSpeechTimer); this.noSpeechTimer = null; }
        console.log('[Audio] Silence-Fire: %s', reason);
        this.silenceListeners.forEach(cb => {
          try { cb(); } catch (e) { console.warn('[Audio] silence listener err:', e); }
        });
      };

      if (autoStop) {
        // Both settings are independent reads, so fetch them together.
        const [vadSilenceMs, maxRecordingMs] = await Promise.all([
          loadVadSilenceMs(),
          loadMaxRecordingMs(),
        ]);

        // The state is already 'recording', so stopRecording may have run while
        // the settings were being read. Arming timers now would leave them
        // running after the recording ended (and they could fire into a later
        // recording), so bail out instead.
        const stillThisSession =
          this.recordingState === 'recording' && this.recordingPath === sessionPath;
        if (!stillThisSession) {
          this.vadEnabled = false;
          console.log('[Audio] Aufnahme wurde waehrend des Starts beendet — keine VAD-Timer gesetzt');
          return true;
        }

        console.log('[Audio] startRecording: autoStop=true, VAD-Stille=%dms, MAX=%dms',
                    vadSilenceMs, maxRecordingMs);
        this.vadTimer = setInterval(() => {
          const silenceDuration = Date.now() - this.lastSpeechTime;
          if (silenceDuration >= vadSilenceMs) {
            fireSilenceOnce(`VAD ${silenceDuration}ms Stille (Schwelle=${vadSilenceMs}ms)`);
          }
        }, 200);
        // Emergency brake: force a stop after maxRecordingMs.
        this.maxDurationTimer = setTimeout(() => {
          fireSilenceOnce(`Max-Dauer ${maxRecordingMs}ms`);
        }, maxRecordingMs);
      }

      // Conversation window: if the user does not start speaking within
      // noSpeechTimeoutMs, report silence (the speech gate discards the recording).
      if (noSpeechTimeoutMs > 0) {
        this.noSpeechTimer = setTimeout(() => {
          if (!this.speechDetected && this.recordingState === 'recording') {
            fireSilenceOnce(`Conversation-Window ${noSpeechTimeoutMs}ms ohne Sprache`);
          }
        }, noSpeechTimeoutMs);
      }

      console.log('[Audio] Aufnahme gestartet (autoStop: %s)', autoStop);
      return true;
    } catch (err) {
      console.error('[Audio] Fehler beim Starten der Aufnahme:', err);
      // Undo whatever was already set up so nothing keeps running while the
      // service reports 'idle': timers, the metering listener, and a recorder
      // that may have started before the failure.
      this.vadEnabled = false;
      if (this.vadTimer) { clearInterval(this.vadTimer); this.vadTimer = null; }
      if (this.maxDurationTimer) { clearTimeout(this.maxDurationTimer); this.maxDurationTimer = null; }
      if (this.noSpeechTimer) { clearTimeout(this.noSpeechTimer); this.noSpeechTimer = null; }
      try {
        this.recorder.removeRecordBackListener();
        this.recorder.stopRecorder().catch(() => {});
      } catch {
        // Best effort only — the original error has already been logged.
      }
      this.setState('idle');
      return false;
    }
  }

  /**
   * Stops the active recording and returns its content.
   *
   * Switches to 'processing', disables VAD and clears all auto-stop timers,
   * stops the native recorder, then applies the speech gate: a recording in
   * which no speech was detected is deleted and reported as null.
   *
   * The metering listener is removed and the AudioFocus release is scheduled
   * even when the native stop fails, so a failed stop neither leaves other apps
   * muted nor keeps metering callbacks alive. The temp file is removed on every
   * path.
   *
   * @returns The recording as base64 with duration and MIME type 'audio/mp4',
   *          or null when no recording is active, no speech was detected, or an
   *          error occurred.
   */
  async stopRecording(): Promise<RecordingResult | null> {
    if (this.recordingState !== 'recording') {
      console.warn('[Audio] Keine aktive Aufnahme');
      return null;
    }

    this.setState('processing');
    this.vadEnabled = false;
    if (this.vadTimer) {
      clearInterval(this.vadTimer);
      this.vadTimer = null;
    }
    if (this.maxDurationTimer) {
      clearTimeout(this.maxDurationTimer);
      this.maxDurationTimer = null;
    }
    if (this.noSpeechTimer) {
      clearTimeout(this.noSpeechTimer);
      this.noSpeechTimer = null;
    }

    const path = this.recordingPath;
    try {
      try {
        await this.recorder.stopRecorder();
      } finally {
        this.recorder.removeRecordBackListener();
        // Release the audio focus with a delay — the TTS answer follows
        // shortly and Spotify should not come up in the gap.
        this._releaseFocusDeferred();
      }

      const durationMs = Date.now() - this.recordingStartTime;
      const hadSpeech = this.speechDetected;

      // Speech gate: discard the recording when no speech was detected.
      if (!hadSpeech) {
        RNFS.unlink(path).catch(() => {});
        this.setState('idle');
        console.log('[Audio] Aufnahme verworfen — keine Sprache erkannt (nur Umgebungsgeraeusche)');
        return null;
      }

      // Read the audio file as base64, then drop the temp file.
      const base64Data = await RNFS.readFile(path, 'base64');
      RNFS.unlink(path).catch(() => {});

      this.setState('idle');
      console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB, Sprache erkannt)`);

      return {
        base64: base64Data,
        durationMs,
        mimeType: 'audio/mp4', // AAC in an MP4 container
      };
    } catch (err) {
      console.error('[Audio] Fehler beim Stoppen der Aufnahme:', err);
      // Do not leave a partial recording behind in the cache directory.
      RNFS.unlink(path).catch(() => {});
      this.setState('idle');
      return null;
    }
  }

  // --- Wiedergabe ---

  /**
   * Appends base64-encoded audio to the playback queue and starts playback if
   * nothing is playing yet. Empty input is ignored.
   *
   * @param base64Data Base64-encoded audio file content.
   */
  async playAudio(base64Data: string): Promise<void> {
    if (base64Data) {
      this.audioQueue.push(base64Data);
      if (!this.isPlaying) {
        this._playNext();
      }
    }
  }

  /**
   * Stores base64 audio persistently as `<documents>/tts_cache/<messageId>.wav`.
   *
   * An existing file for the same messageId is overwritten, never appended to:
   * when several chunks arrive for one message, only the last one remains. A
   * real concatenation would require merging the WAV headers.
   *
   * @param base64Data Base64-encoded audio file content.
   * @param messageId  Id used as the file name.
   * @returns The `file://` path of the written file, or '' when an argument is
   *          empty or writing failed.
   */
  async cacheAudio(base64Data: string, messageId: string): Promise<string> {
    if (!base64Data || !messageId) return '';
    try {
      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      // The directory usually exists already; a failing mkdir is not fatal here
      // because writeFile reports the real problem if there is one.
      await RNFS.mkdir(dir).catch(() => {});
      const path = `${dir}/${messageId}.wav`;
      await RNFS.writeFile(path, base64Data, 'base64');
      return `file://${path}`;
    } catch (err) {
      console.warn('[Audio] cacheAudio fehlgeschlagen:', err);
      return '';
    }
  }

  /**
   * Receives one PCM chunk from an audio_pcm message.
   *
   * silent=true → cache only, do not play (e.g. when TTS is muted on this device).
   *
   * This wrapper serialises consecutive chunk calls through a promise queue.
   * Without it short streams raced: the final chunk could call `end()` BEFORE
   * the preceding `start()` had finished in the native module, and the writer
   * thread then saw endRequested=true without ever processing a chunk.
   *
   * @returns For the final chunk the cache path (`file://…`), or '' when
   *          nothing was cached; '' for every non-final chunk and whenever
   *          processing the chunk failed (the failure is logged and does not
   *          block later chunks).
   */
  private _pcmChunkQueue: Promise<unknown> = Promise.resolve();
  async handlePcmChunk(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    const result = this._pcmChunkQueue
      .then(() => this._handlePcmChunkImpl(payload))
      .catch((err: unknown) => {
        console.warn('[Audio] handlePcmChunk queued err:', err);
        return '';
      });
    // The queue only needs the ordering; this promise never rejects, so one
    // failed chunk cannot poison the chain. Callers still get their own result.
    this._pcmChunkQueue = result;
    return result;
  }

  /**
   * Processes one PCM chunk: opens a new stream when needed, plays the chunk
   * (unless silent), collects it for the WAV cache, and finalises the stream
   * on the last chunk.
   *
   * A new stream begins when none is active or the messageId changes. The
   * cache only collects chunks when a messageId is present and the
   * PCM_MAX_CACHE_BYTES budget is not yet exhausted.
   *
   * The collected byte count is derived from the base64 length minus its '='
   * padding, so the data size written into the WAV header matches the bytes
   * that actually end up in the file.
   *
   * NOTE(review): when an audible stream is replaced by a silent one with a
   * different messageId, the native player is not stopped (the stop is guarded
   * by `!silent` of the NEW chunk) — confirm this is intended.
   *
   * @param payload Chunk message; `base64` holds the PCM data, `final` marks
   *                the last chunk, `silent` suppresses playback.
   * @returns The `file://` path of the cached WAV for a final chunk with cached
   *          data; '' otherwise.
   */
  private async _handlePcmChunkImpl(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    const silent = !!payload.silent;
    if (!silent && !PcmStreamPlayer) {
      console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar');
      return '';
    }
    // Debug log for chunk 0 of a new stream — shows in adb logcat why
    // auto-playback kicks in or not.
    if ((payload.chunk ?? 0) === 0 && !this.pcmStreamActive) {
      console.log('[Audio] PCM-Stream start: silent=%s messageId=%s sr=%s ch=%s',
                  silent, payload.messageId || '(none)',
                  payload.sampleRate, payload.channels);
    }

    const messageId = payload.messageId || '';
    const sampleRate = payload.sampleRate || 24000;
    const channels = payload.channels || 1;
    const base64 = payload.base64 || '';
    const isFinal = !!payload.final;

    // New stream? (messageId changed or nothing active)
    if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
      if (this.pcmStreamActive && !silent) {
        try { await PcmStreamPlayer!.stop(); } catch {}
      }
      this.pcmStreamActive = true;
      this.pcmMessageId = messageId;
      this.pcmSampleRate = sampleRate;
      this.pcmChannels = channels;
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      if (!silent) {
        const prerollSec = await loadPrerollSec();
        try {
          await PcmStreamPlayer!.start(sampleRate, channels, prerollSec);
        } catch (err) {
          console.error('[Audio] PcmStreamPlayer.start fehlgeschlagen:', err);
          this.pcmStreamActive = false;
          return '';
        }
        this._cancelDeferredFocusRelease();
        AudioFocus?.requestDuck().catch(() => {});
        this._firePlaybackStarted();
      }
    }

    // Chunk — always cache, play only when not silent
    if (base64) {
      if (!silent) {
        try { await PcmStreamPlayer!.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
      }
      if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
        this.pcmBuffer.push(base64);
        // Decoded size: 3 bytes per 4 characters, minus one byte per trailing '='.
        const padding = base64.endsWith('==') ? 2 : base64.endsWith('=') ? 1 : 0;
        const decodedBytes = Math.max(0, Math.floor((base64.length * 3) / 4) - padding);
        this.pcmBytesCollected += decodedBytes;
      }
    }

    if (isFinal) {
      if (!silent) {
        // end() only resolves once the native writer thread is done (all
        // samples played) — afterwards the AudioFocus is released with a delay
        // so Spotify/YouTube do not turn up again in the tiny gap between two
        // ARIA answers. If a new stream starts within FOCUS_RELEASE_DELAY_MS,
        // the release is cancelled.
        try { await PcmStreamPlayer!.end(); } catch {}
        this._releaseFocusDeferred();
      }
      this.pcmStreamActive = false;

      if (messageId && this.pcmBuffer.length > 0) {
        const audioPath = await this._savePcmBufferAsWav(messageId);
        this.pcmBuffer = [];
        this.pcmBytesCollected = 0;
        this.pcmMessageId = '';
        return audioPath;
      }
      this.pcmMessageId = '';
    }
    return '';
  }

  /**
   * Writes the collected PCM chunks as `<documents>/tts_cache/<messageId>.wav`.
   *
   * Builds a 44-byte RIFF/WAVE header for 16-bit little-endian PCM from the
   * stream's sample rate, channel count and collected byte count, writes it,
   * and then appends every buffered base64 chunk in order.
   *
   * @param messageId Id used as the file name.
   * @returns The `file://` path of the written file, or '' if anything failed.
   */
  private async _savePcmBufferAsWav(messageId: string): Promise<string> {
    try {
      const cacheDir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      await RNFS.mkdir(cacheDir).catch(() => {});
      const path = `${cacheDir}/${messageId}.wav`;

      // Format parameters for PCM s16le
      const rate = this.pcmSampleRate;
      const channelCount = this.pcmChannels;
      const bitsPerSample = 16;
      const bytesPerFrame = channelCount * bitsPerSample / 8;
      const dataSize = this.pcmBytesCollected;

      const header = new Uint8Array(44);
      const view = new DataView(header.buffer);
      // Writes a four-character ASCII chunk id at the given offset.
      const putTag = (offset: number, tag: string): void => {
        for (let k = 0; k < tag.length; k++) {
          header[offset + k] = tag.charCodeAt(k);
        }
      };

      putTag(0, 'RIFF');
      view.setUint32(4, 36 + dataSize, true);        // RIFF size = header remainder + data
      putTag(8, 'WAVE');
      putTag(12, 'fmt ');
      view.setUint32(16, 16, true);                  // fmt chunk size
      view.setUint16(20, 1, true);                   // PCM format
      view.setUint16(22, channelCount, true);
      view.setUint32(24, rate, true);
      view.setUint32(28, rate * bytesPerFrame, true); // byte rate
      view.setUint16(32, bytesPerFrame, true);        // block align
      view.setUint16(34, bitsPerSample, true);
      putTag(36, 'data');
      view.setUint32(40, dataSize, true);

      // Header bytes → binary string → base64
      const headerB64 = btoaSafe(String.fromCharCode(...Array.from(header)));

      // Write the file: header first, then all PCM chunks
      await RNFS.writeFile(path, headerB64, 'base64');
      const chunks = this.pcmBuffer;
      for (const encodedChunk of chunks) {
        await RNFS.appendFile(path, encodedChunk, 'base64');
      }
      console.log(`[Audio] PCM-Cache geschrieben: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
      return `file://${path}`;
    } catch (err) {
      console.warn('[Audio] _savePcmBufferAsWav fehlgeschlagen:', err);
      return '';
    }
  }

  /**
   * Queues audio from a local file for playback.
   *
   * A leading `file://` is stripped. A missing file or a read error is logged
   * and otherwise ignored.
   *
   * @param filePath Local path, with or without the `file://` prefix.
   */
  async playFromPath(filePath: string): Promise<void> {
    if (!filePath) return;
    try {
      const localPath = filePath.replace(/^file:\/\//, '');
      const present = await RNFS.exists(localPath);
      if (present) {
        const encoded = await RNFS.readFile(localPath, 'base64');
        this.playAudio(encoded);
      } else {
        console.warn('[Audio] Cache-Datei existiert nicht mehr:', localPath);
      }
    } catch (err) {
      console.warn('[Audio] playFromPath fehlgeschlagen:', err);
    }
  }

  // Listener registries: "all queued audio parts have been played" and
  // "playback has started".
  private playbackFinishedListeners: (() => void)[] = [];
  private playbackStartedListeners: (() => void)[] = [];

  /**
   * Registers a callback for the moment all queued audio parts were played.
   *
   * @param callback Listener to add.
   * @returns A function that removes this listener again.
   */
  onPlaybackFinished(callback: () => void): () => void {
    this.playbackFinishedListeners.push(callback);
    const unsubscribe = (): void => {
      this.playbackFinishedListeners = this.playbackFinishedListeners.filter(
        registered => registered !== callback,
      );
    };
    return unsubscribe;
  }

  /**
   * Registers a callback for the start of ARIA's TTS playback — used for
   * listening for the wake word in parallel while ARIA speaks (barge-in by
   * saying "Computer").
   *
   * @param callback Listener to add.
   * @returns A function that removes this listener again.
   */
  onPlaybackStarted(callback: () => void): () => void {
    this.playbackStartedListeners.push(callback);
    const unsubscribe = (): void => {
      this.playbackStartedListeners = this.playbackStartedListeners.filter(
        registered => registered !== callback,
      );
    };
    return unsubscribe;
  }

  /**
   * Notifies every playback-started listener. A listener that throws is
   * logged as a warning and does not prevent the remaining ones from running.
   */
  private _firePlaybackStarted(): void {
    // Iterate over a snapshot so listeners added during dispatch are not called.
    const listeners = this.playbackStartedListeners.slice();
    for (const listener of listeners) {
      try {
        listener();
      } catch (e) {
        console.warn('[Audio] playbackStarted listener err:', e);
      }
    }
  }

  /**
   * Plays the next queued audio item, or finalises the playback run when the
   * queue is empty.
   *
   * Empty queue: clears `isPlaying`, schedules the deferred audio-focus
   * release and notifies the playback-finished listeners.
   *
   * Otherwise: on the first item of a run it cancels a pending focus release,
   * requests ducking and fires the playback-started listeners. It then uses
   * the preloaded sound if one exists, or writes the next base64 item to a
   * temporary WAV file and loads it. When the sound completes, it is released,
   * its file is deleted and the method calls itself for the next item.
   *
   * @returns A promise that resolves once playback of the item has been
   *   started (not when it has finished), or once the run has been finalised.
   */
  private async _playNext(): Promise<void> {
    if (this.audioQueue.length === 0) {
      this.isPlaying = false;
      // Give up audio focus with a delay, so that a follow-up reply arriving
      // shortly afterwards does not let other apps resume in between.
      this._releaseFocusDeferred();
      // All parts played → notify listeners. Each listener is isolated so one
      // throwing callback neither skips the others nor rejects this promise.
      this.playbackFinishedListeners.forEach(cb => {
        try { cb(); } catch (e) { console.warn('[Audio] playbackFinished listener err:', e); }
      });
      return;
    }

    // First item of a playback run: duck other apps and inform listeners.
    if (!this.isPlaying) {
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestDuck().catch(() => {});
      this._firePlaybackStarted();
    }
    this.isPlaying = true;

    // Use the preloaded sound when available, otherwise load from the queue.
    let sound: Sound;
    let soundPath: string;

    if (this.preloadedSound) {
      sound = this.preloadedSound;
      soundPath = this.preloadedPath;
      this.preloadedSound = null;
      this.preloadedPath = '';
      // Drop the queue entry the preloaded sound was created from.
      this.audioQueue.shift();
    } else {
      const base64Data = this.audioQueue.shift()!;
      const loadPath = `${RNFS.CachesDirectoryPath}/aria_tts_${Date.now()}.wav`;
      try {
        await RNFS.writeFile(loadPath, base64Data, 'base64');
        sound = await new Promise<Sound>((resolve, reject) => {
          const s = new Sound(loadPath, '', (err) => err ? reject(err) : resolve(s));
        });
      } catch (err) {
        console.error('[Audio] Laden fehlgeschlagen:', err);
        // The file may already exist when only the Sound load failed.
        RNFS.unlink(loadPath).catch(() => {});
        // Skip the broken item and continue with the rest of the queue.
        return this._playNext();
      }
      soundPath = loadPath;

      // stopPlayback() clears isPlaying; if that happened while the load was
      // awaited, do not start a sound the user has already cancelled.
      if (!this.isPlaying) {
        sound.release();
        RNFS.unlink(soundPath).catch(() => {});
        return;
      }
    }

    this.currentSound = sound;

    // Prepare the following item in the background while this one plays.
    void this._preloadNext();

    sound.play((success) => {
      if (!success) console.warn('[Audio] Wiedergabe fehlgeschlagen');
      sound.release();
      this.currentSound = null;
      RNFS.unlink(soundPath).catch(() => {});
      void this._playNext();
    });
  }

  /**
   * Loads the item at the head of the queue into a `Sound` ahead of time, so
   * that `_playNext` can start it without a load delay.
   *
   * The item stays in the queue. The loaded sound is stored in
   * `preloadedSound` / `preloadedPath` only if, after the asynchronous load,
   * the queue head is still the item that was loaded and no other preload has
   * been stored meanwhile. Otherwise the result is stale: the item was already
   * consumed or the queue was cleared. It is then released and its temp file
   * deleted. Storing a stale preload would replay that audio and make the
   * consumer drop an unrelated queue entry.
   *
   * Load failures are logged and leave the preload slot untouched.
   */
  private async _preloadNext(): Promise<void> {
    if (this.audioQueue.length === 0 || this.preloadedSound) return;

    const base64Data = this.audioQueue[0]; // peek only — the item stays queued
    const tmpPath = `${RNFS.CachesDirectoryPath}/aria_tts_pre_${Date.now()}.wav`;

    let loaded: Sound;
    try {
      await RNFS.writeFile(tmpPath, base64Data, 'base64');
      loaded = await new Promise<Sound>((resolve, reject) => {
        const s = new Sound(tmpPath, '', (err) => err ? reject(err) : resolve(s));
      });
    } catch (err) {
      console.warn('[Audio] Preload fehlgeschlagen:', err);
      // The file may already exist when only the Sound load failed.
      RNFS.unlink(tmpPath).catch(() => {});
      return;
    }

    // The queue may have changed while the awaits above were pending.
    const isStale = this.preloadedSound !== null || this.audioQueue[0] !== base64Data;
    if (isStale) {
      loaded.release();
      RNFS.unlink(tmpPath).catch(() => {});
      return;
    }

    this.preloadedSound = loaded;
    this.preloadedPath = tmpPath;
  }

  /**
   * Stops the running playback and discards everything that is still pending:
   * the audio queue, the current sound, a preloaded sound (including its temp
   * file) and an active PCM stream. Audio focus is released immediately.
   */
  stopPlayback(): void {
    this.audioQueue = [];
    this.isPlaying = false;

    const active = this.currentSound;
    if (active) {
      active.stop();
      active.release();
      this.currentSound = null;
    }

    const preloaded = this.preloadedSound;
    if (preloaded) {
      preloaded.release();
      this.preloadedSound = null;
      const stalePath = this.preloadedPath;
      if (stalePath) {
        RNFS.unlink(stalePath).catch(() => {});
      }
      this.preloadedPath = '';
    }

    // Hard-stop the PCM stream as well (cancel/abort) and reset its bookkeeping.
    if (this.pcmStreamActive) {
      PcmStreamPlayer?.stop().catch(() => {});
      this.pcmStreamActive = false;
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      this.pcmMessageId = '';
    }

    // Explicit stop: give up audio focus right away instead of deferring it.
    this._cancelDeferredFocusRelease();
    AudioFocus?.release().catch(() => {});
  }

  // --- Status & Callbacks ---

  /** Returns the recording state currently held by the service. */
  getRecordingState(): RecordingState {
    const { recordingState } = this;
    return recordingState;
  }

  /**
   * Registers a listener for recording-state changes.
   *
   * @param callback Receives the new state whenever `setState` changes it.
   * @returns An unsubscribe function that removes `callback` again.
   */
  onStateChange(callback: RecordingStateCallback): () => void {
    this.stateListeners.push(callback);

    const unsubscribe = (): void => {
      this.stateListeners = this.stateListeners.filter(
        (registered) => registered !== callback,
      );
    };
    return unsubscribe;
  }

  /**
   * Registers a listener for metering updates (per the original note: level
   * values reported while recording).
   *
   * @param callback Listener to add to the metering listener list.
   * @returns An unsubscribe function that removes `callback` again.
   */
  onMeterUpdate(callback: MeterCallback): () => void {
    this.meterListeners.push(callback);

    const unsubscribe = (): void => {
      this.meterListeners = this.meterListeners.filter(
        (registered) => registered !== callback,
      );
    };
    return unsubscribe;
  }

  /**
   * Registers a listener for silence detection (per the original note: the
   * VAD-based auto-stop).
   *
   * @param callback Listener to add to the silence listener list.
   * @returns An unsubscribe function that removes `callback` again.
   */
  onSilenceDetected(callback: SilenceCallback): () => void {
    this.silenceListeners.push(callback);

    const unsubscribe = (): void => {
      this.silenceListeners = this.silenceListeners.filter(
        (registered) => registered !== callback,
      );
    };
    return unsubscribe;
  }

  /**
   * Updates the recording state and notifies the state listeners.
   * Does nothing when the state is unchanged.
   *
   * @param state The new recording state.
   */
  private setState(state: RecordingState): void {
    if (this.recordingState === state) return;

    this.recordingState = state;
    this.stateListeners.forEach((listener) => {
      listener(state);
    });
  }

  /**
   * Best-effort removal of stale recording and TTS files from the cache
   * directory. Only regular files whose name starts with `aria_recording_` or
   * `aria_tts_` and whose modification time is more than 30 seconds in the
   * past are deleted. All errors are ignored.
   */
  private async _cleanupStaleCacheFiles(): Promise<void> {
    const ownedPrefixes = ['aria_recording_', 'aria_tts_'];
    try {
      const entries = await RNFS.readDir(RNFS.CachesDirectoryPath);
      const now = Date.now();
      for (const entry of entries) {
        const isOwnedFile =
          entry.isFile() && ownedPrefixes.some((prefix) => entry.name.startsWith(prefix));
        if (!isOwnedFile) continue;

        // A missing mtime counts as epoch 0, which makes the file stale.
        const modifiedAt = entry.mtime ? entry.mtime.getTime() : 0;
        if (now - modifiedAt > 30000) {
          await RNFS.unlink(entry.path).catch(() => {});
        }
      }
    } catch {
      // Intentionally silent — cleanup is best-effort.
    }
  }

  /**
   * Deletes cached TTS `.wav` files from `<documents>/tts_cache` that are both
   * unreferenced and too old. The message id is the file name without `.wav`.
   *
   * @param keepMessageIds Message ids whose cache files must be kept.
   * @param maxAgeDays Minimum age in days before an unreferenced file is
   *   deleted (default 30).
   * @returns A promise that resolves when the sweep is done; errors are ignored.
   */
  async cleanupOldTTSCache(keepMessageIds: Set<string>, maxAgeDays = 30): Promise<void> {
    try {
      const cacheDir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      const cacheDirExists = await RNFS.exists(cacheDir);
      if (!cacheDirExists) return;

      const entries = await RNFS.readDir(cacheDir);
      const maxAgeMs = maxAgeDays * 24 * 60 * 60 * 1000;
      const now = Date.now();
      for (const entry of entries) {
        const isWavFile = entry.isFile() && entry.name.endsWith('.wav');
        if (!isWavFile) continue;

        const messageId = entry.name.replace(/\.wav$/, '');
        // A missing mtime counts as epoch 0, i.e. as very old.
        const modifiedAt = entry.mtime ? entry.mtime.getTime() : 0;
        const isExpired = now - modifiedAt > maxAgeMs;
        const isReferenced = keepMessageIds.has(messageId);
        // Delete only when the file is unreferenced AND older than the limit.
        if (isExpired && !isReferenced) {
          await RNFS.unlink(entry.path).catch(() => {});
        }
      }
    } catch {
      // Intentionally silent.
    }
  }
}

// Singleton: a single AudioService instance is created when this module is
// loaded and shared through the default export.
const audioService = new AudioService();
export default audioService;