/**
 * Audio service for voice input/output.
 *
 * Manages microphone recording (with VAD / auto-stop on silence),
 * TTS audio playback and metering for visual feedback.
 * Uses react-native-audio-recorder-player for recording.
 */

import { Platform, PermissionsAndroid, NativeModules, ToastAndroid, NativeEventEmitter } from 'react-native';
import Sound from 'react-native-sound';
import RNFS from 'react-native-fs';
import AsyncStorage from '@react-native-async-storage/async-storage';
import { acquireBackgroundAudio, releaseBackgroundAudio, stopBackgroundAudio } from './backgroundAudio';
import AudioRecorderPlayer, {
  AudioEncoderAndroidType,
  AudioSourceAndroidType,
  AVEncodingOption,
  OutputFormatAndroidType,
} from 'react-native-audio-recorder-player';

// Base64 encoder for binary strings (header bytes → Base64)
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

/**
 * Encodes a binary string as Base64.
 *
 * Only the low 8 bits of every char code are used, so each character is
 * treated as one byte. Output is padded with '=' to a multiple of four.
 *
 * @param bin binary string, one character per byte
 * @returns the Base64 representation ('' for an empty input)
 */
function btoaSafe(bin: string): string {
  let out = '';
  const len = bin.length;
  for (let i = 0; i < len; i += 3) {
    const b1 = bin.charCodeAt(i) & 0xff;
    const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
    const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
    out += B64_CHARS[b1 >> 2];
    out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
    // Third/fourth sextet only exist when the source bytes do — otherwise pad.
    out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
    out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
  }
  return out;
}

// Native modules for audio focus (ducking/muting other apps) and PCM streaming.
// NOTE(review): the resolved value types of these promises are not visible in
// this file; `unknown` is used because the call sites here only await or
// .catch() them — confirm against the native module definitions.
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
  AudioFocus?: {
    requestDuck: () => Promise<unknown>;
    requestExclusive: () => Promise<unknown>;
    release: () => Promise<unknown>;
    kickReleaseMedia: () => Promise<unknown>;
    getMode?: () => Promise<unknown>;
  };
  PcmStreamPlayer?: {
    start: (sampleRate: number, channels: number, prerollSeconds: number) => Promise<unknown>;
    writeChunk: (base64Pcm: string) => Promise<unknown>;
    end: () => Promise<unknown>;
    stop: () => Promise<unknown>;
  };
};

// --- Types ---

export interface RecordingResult {
  /** Base64-encoded audio data */
  base64: string;
  /** Duration in milliseconds */
  durationMs: number;
  /** MIME type (e.g. audio/mp4) */
  mimeType: string;
}

export type RecordingState = 'idle' | 'recording' | 'processing';

type RecordingStateCallback = (state: RecordingState) => void;
type MeterCallback = (db: number) => void;
type SilenceCallback = () => void;

// --- Constants ---

const AUDIO_SAMPLE_RATE = 16000;
const AUDIO_CHANNELS = 1;
const AUDIO_ENCODING = 'audio/wav';

// VAD (voice activity detection) — silence detection.
// Fallback values in case the adaptive baseline measurement fails (e.g. because
// the microphone delivers no metering updates). Adaptive values are measured at
// runtime from the first VAD_BASELINE_SAMPLES samples and set to
// baseline + offset — works in loud as well as quiet environments.
const VAD_SILENCE_FALLBACK_DB = -38; // fallback silence threshold
const VAD_SPEECH_FALLBACK_DB = -22; // fallback speech threshold
const VAD_SILENCE_OFFSET_DB = 6; // silence threshold = baseline + 6 dB
const VAD_SPEECH_OFFSET_DB = 12; // confident speech = baseline + 12 dB
const VAD_BASELINE_SAMPLES = 5; // 5 × 100 ms = 500 ms baseline
const VAD_SPEECH_MIN_MS = 500; // ms of speech before a recording counts — longer = no more coughs/knocks

// Override for the silence threshold — when set, the adaptive baseline is
// ignored.
// Useful when the adaptive logic does not engage reliably in a specific
// environment. Accepted range is VAD_SILENCE_DB_MIN..VAD_SILENCE_DB_MAX dB.
// The speech threshold is set to override + 10 dB (speech has to be clearly
// louder than silence).
export const VAD_SILENCE_DB_DEFAULT = -38; // used when the user picks manual mode
export const VAD_SILENCE_DB_MIN = -85; // extremely sensitive, practically everything counts as speech
export const VAD_SILENCE_DB_MAX = -15; // very insensitive, only loud talking counts
export const VAD_SILENCE_DB_OVERRIDE_KEY = 'aria_vad_silence_db_override';

/**
 * Reads the manual silence-threshold override from AsyncStorage.
 *
 * @returns the stored dB value, or null for "automatic" — i.e. when nothing
 *          is stored, the value is not a finite number, it lies outside
 *          VAD_SILENCE_DB_MIN..VAD_SILENCE_DB_MAX, or the storage read fails.
 */
export async function loadVadSilenceDbOverride(): Promise<number | null> {
  try {
    const raw = await AsyncStorage.getItem(VAD_SILENCE_DB_OVERRIDE_KEY);
    if (raw == null || raw === '') return null;
    const n = parseFloat(raw);
    if (!isFinite(n)) return null;
    if (n < VAD_SILENCE_DB_MIN || n > VAD_SILENCE_DB_MAX) return null;
    return n;
  } catch {
    // Best effort: a broken storage read falls back to automatic mode.
    return null;
  }
}

// VAD silence (in seconds) — how long a pause in speech is tolerated before
// the recording is stopped automatically. Adjustable in the app settings.
export const VAD_SILENCE_DEFAULT_SEC = 2.8;
export const VAD_SILENCE_MIN_SEC = 1.0;
export const VAD_SILENCE_MAX_SEC = 8.0;
export const VAD_SILENCE_STORAGE_KEY = 'aria_vad_silence_sec';

// Conversation window (in seconds) — after ARIA's answer the user has this
// long to keep talking in conversation mode without the conversation being
// ended. Saying nothing within the window turns the conversation off.
export const CONV_WINDOW_DEFAULT_SEC = 8.0;
export const CONV_WINDOW_MIN_SEC = 3.0;
export const CONV_WINDOW_MAX_SEC = 20.0;
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';

// TTS playback speed — stored per device and passed on to the bridge
// (speed parameter of the F5-TTS infer()). 1.0 = normal.
export const TTS_SPEED_DEFAULT = 1.0;
export const TTS_SPEED_MIN = 0.1;
export const TTS_SPEED_MAX = 5.0;
export const TTS_SPEED_STORAGE_KEY = 'aria_tts_speed';

/**
 * Reads a numeric setting from AsyncStorage and validates it.
 *
 * @param key storage key of the setting
 * @param min smallest accepted value (inclusive)
 * @param max largest accepted value (inclusive)
 * @returns the stored number, or null when nothing is stored, the value is
 *          not a finite number, it is out of range, or the read fails
 */
async function readBoundedSetting(key: string, min: number, max: number): Promise<number | null> {
  try {
    const raw = await AsyncStorage.getItem(key);
    if (raw == null) return null;
    const n = parseFloat(raw);
    return isFinite(n) && n >= min && n <= max ? n : null;
  } catch {
    // Best effort: callers fall back to their default on storage errors.
    return null;
  }
}

/** Returns the stored TTS playback speed, or TTS_SPEED_DEFAULT if unset/invalid. */
export async function loadTtsSpeed(): Promise<number> {
  const speed = await readBoundedSetting(TTS_SPEED_STORAGE_KEY, TTS_SPEED_MIN, TTS_SPEED_MAX);
  return speed ?? TTS_SPEED_DEFAULT;
}

/** Returns the conversation window in milliseconds (default if unset/invalid). */
export async function loadConvWindowMs(): Promise<number> {
  const sec = await readBoundedSetting(CONV_WINDOW_STORAGE_KEY, CONV_WINDOW_MIN_SEC, CONV_WINDOW_MAX_SEC);
  return Math.round((sec ?? CONV_WINDOW_DEFAULT_SEC) * 1000);
}

/** Returns the tolerated VAD silence in milliseconds (default if unset/invalid). */
async function loadVadSilenceMs(): Promise<number> {
  const sec = await readBoundedSetting(VAD_SILENCE_STORAGE_KEY, VAD_SILENCE_MIN_SEC, VAD_SILENCE_MAX_SEC);
  return Math.round((sec ?? VAD_SILENCE_DEFAULT_SEC) * 1000);
}

// Maximum duration of a recording (emergency brake against runaway loops).
// Default 5 minutes — configurable in the app settings (1-30 minutes).
export const MAX_RECORDING_DEFAULT_SEC = 300;
export const MAX_RECORDING_MIN_SEC = 60;
export const MAX_RECORDING_MAX_SEC = 1800;
export const MAX_RECORDING_STORAGE_KEY = 'aria_max_recording_sec';

/** Returns the maximum recording duration in milliseconds (default if unset/invalid). */
export async function loadMaxRecordingMs(): Promise<number> {
  const sec = await readBoundedSetting(MAX_RECORDING_STORAGE_KEY, MAX_RECORDING_MIN_SEC, MAX_RECORDING_MAX_SEC);
  return sec == null ? MAX_RECORDING_DEFAULT_SEC * 1000 : Math.round(sec * 1000);
}

// Pre-roll: how long audio sits in the AudioTrack buffer before play() starts.
// Adjustable via diagnostics/settings (key: aria_tts_preroll_sec).
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
export const TTS_PREROLL_MIN_SEC = 0; // 0 = play immediately (F5-TTS is fast enough)
export const TTS_PREROLL_MAX_SEC = 6.0;
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';

/**
 * Returns the stored TTS pre-roll in seconds, or TTS_PREROLL_DEFAULT_SEC when
 * nothing valid is stored (missing, not finite, out of range, read error).
 */
async function loadPrerollSec(): Promise<number> {
  try {
    const raw = await AsyncStorage.getItem(TTS_PREROLL_STORAGE_KEY);
    if (raw != null) {
      const n = parseFloat(raw);
      if (isFinite(n) && n >= TTS_PREROLL_MIN_SEC && n <= TTS_PREROLL_MAX_SEC) {
        return n;
      }
    }
  } catch {
    // Best effort: fall through to the default.
  }
  return TTS_PREROLL_DEFAULT_SEC;
}

// --- Audio service ---

class AudioService {
  private recordingState: RecordingState = 'idle';
  private recordingStartTime: number = 0;
  private stateListeners: RecordingStateCallback[] = [];
  private meterListeners: MeterCallback[] = [];
  private silenceListeners: SilenceCallback[] = [];
  private currentSound: Sound | null = null;
  private recorder: AudioRecorderPlayer;
  private recordingPath: string = '';

  // Audio queue for sequential TTS playback
  private audioQueue: string[] = [];
  private isPlaying: boolean = false;
  private preloadedSound: Sound | null = null;
  private preloadedPath: string = '';

  // Speech gate: only send a recording once speech was actually detected
  private speechDetected: boolean = false;
  private speechStartTime: number = 0;

  // PCM stream (XTTS): active session + cache buffer per messageId
  private pcmStreamActive: boolean = false;
  private pcmMessageId: string = '';
  private pcmSampleRate: number = 24000;
  private pcmChannels: number = 1;
  private pcmBuffer: string[] = []; // base64 chunks for the later WAV build
  private pcmBytesCollected: number = 0;
  private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB

  // AudioFocus is released with a delay — when ARIA sends a second answer
  // right behind the first (or a new stream starts), Spotify stays paused.
  // Without this delay Spotify briefly resumes in the tiny gap between two
  // streams.
  private focusReleaseTimer: ReturnType<typeof setTimeout> | null = null;
  private readonly FOCUS_RELEASE_DELAY_MS = 800;

  // Conversation mode: while active (wake-word status 'conversing' OR we know
  // "ARIA is currently speaking in a multi-turn dialog") the AudioFocus is
  // held PERMANENTLY. The per-stream release is suppressed so Spotify does
  // not come back during render pauses or between answers.
  private _conversationFocusActive: boolean = false;

  // VAD state
  private vadEnabled: boolean = false;
  private lastSpeechTime: number = 0;
  private vadTimer: ReturnType<typeof setInterval> | null = null;
  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
  // Latch so the silence callback fires exactly once per recording
  private silenceFired: boolean = false;
  private noSpeechTimer: ReturnType<typeof setTimeout> | null = null;

  // Adaptive thresholds — measured from the microphone level during the first
  // 500 ms. baseline = lowest dB of the first VAD_BASELINE_SAMPLES samples,
  // then (each clamped to a sane range in startRecording):
  //   silence = baseline + VAD_SILENCE_OFFSET_DB (6 dB above ambient)
  //   speech  = baseline + VAD_SPEECH_OFFSET_DB  (12 dB above ambient = clear talking)
  // Works in a quiet office as well as in a loud cafe.
private vadBaselineSamples: number[] = [];
  private vadAdaptiveSilenceDb: number = VAD_SILENCE_FALLBACK_DB;
  private vadAdaptiveSpeechDb: number = VAD_SPEECH_FALLBACK_DB;

  // Interruption tracking for auto-resume after a phone call:
  // - playbackStartTime: ms timestamp at which the AudioTrack actually began
  //   playing (= _firePlaybackStarted)
  // - currentPlaybackMsgId: which answer was playing
  // - pausedPosition / pausedMessageId: remembered by captureInterruption
  private playbackStartTime = 0;
  private currentPlaybackMsgId = '';
  private pausedPosition = 0; // seconds into the audio file
  private pausedMessageId = '';
  private resumeSound: Sound | null = null; // kept referenced so GC does not collect it
  // Leading silence is written natively in front of the chunks — it is
  // subtracted from playbackStarted when computing the position
  private readonly LEADING_SILENCE_SEC = 0.3;

  constructor() {
    const recorder = new AudioRecorderPlayer();
    recorder.setSubscriptionDuration(0.1); // metering updates every 100 ms
    this.recorder = recorder;

    // Native event: the AudioTrack has really played all samples (after the
    // finally{} block in the writer thread). ONLY now may AudioFocus be
    // released — otherwise Spotify already plays while ARIA is still talking
    // (PcmStreamPlayer.end() returns far too early with its 15 s cap).
    if (PcmStreamPlayer) {
      const onPlaybackFinished = (): void => {
        console.log('[Audio] PcmPlaybackFinished — Focus jetzt freigeben');
        this._releaseFocusDeferred();
      };
      try {
        new NativeEventEmitter(NativeModules.PcmStreamPlayer as any).addListener(
          'PcmPlaybackFinished',
          onPlaybackFinished,
        );
      } catch (err) {
        console.warn('[Audio] PcmPlaybackFinished-Subscription fehlgeschlagen:', err);
      }
    }

    // App start: clear orphaned aria_tts_*.wav / aria_recording_*.mp4 from the
    // cache. They pile up when a Sound is stopped mid-playback (call, mute,
    // barge-in) — the completion callback does not fire then and the file
    // stays behind. The 5 min threshold keeps files that are being written
    // right now safe. The cleanup is async and does not block the constructor.
    const staleAfterMs = 5 * 60 * 1000;
    this._cleanupStaleCacheFiles(staleAfterMs).catch(() => {});
  }

  /**
   * Releases AudioFocus after a short delay — otherwise Spotify/YouTube
   * briefly resume in the gap between two TTS streams (or when ARIA sends a
   * second answer right behind the first).
   * In conversation mode (wake-word conversing) the release is suppressed
   * entirely — the focus stays held for the whole conversation.
   */
  private _releaseFocusDeferred(): void {
    if (this._conversationFocusActive) {
      console.log('[Audio] _releaseFocusDeferred: Conversation aktiv → kein Release');
      this._cancelDeferredFocusRelease();
      return;
    }

    const releaseNow = (): void => {
      this.focusReleaseTimer = null;
      // Conversation mode may have started while the timer was pending.
      if (this._conversationFocusActive) {
        console.log('[Audio] Focus-Release abgebrochen (Conversation jetzt aktiv)');
        return;
      }
      console.log('[Audio] AudioFocus jetzt released');
      AudioFocus?.release().catch(() => {});
    };

    this._cancelDeferredFocusRelease();
    console.log('[Audio] _releaseFocusDeferred: in %dms', this.FOCUS_RELEASE_DELAY_MS);
    this.focusReleaseTimer = setTimeout(releaseNow, this.FOCUS_RELEASE_DELAY_MS);
  }

  /** Cancels a pending deferred focus release, if there is one. */
  private _cancelDeferredFocusRelease(): void {
    if (!this.focusReleaseTimer) return;
    clearTimeout(this.focusReleaseTimer);
    this.focusReleaseTimer = null;
  }

  /**
   * Conversation mode begins → hold AudioFocus permanently (Spotify stays
   * paused). Idempotent: calling it repeatedly is safe.
   */
  acquireConversationFocus(): void {
    const alreadyHeld = this._conversationFocusActive;
    if (alreadyHeld) return;

    this._conversationFocusActive = true;
    this._cancelDeferredFocusRelease();
    console.log('[Audio] Conversation-Focus aktiv (Spotify bleibt gepaust)');
    AudioFocus?.requestDuck().catch(() => {});
  }

  /**
   * Conversation mode ends → focus may be released again (deferred, so that
   * an answer following right behind does not break anything).
*/
  releaseConversationFocus(): void {
    const wasActive = this._conversationFocusActive;
    if (!wasActive) return;

    this._conversationFocusActive = false;
    console.log('[Audio] Conversation-Focus inaktiv');
    this._releaseFocusDeferred();
  }

  /**
   * Hard-stops TTS playback — e.g. for barge-in. Clears the conversation
   * focus flag and delegates to stopPlayback().
   * NOTE(review): the original comment says the buffer is emptied, there is
   * no auto-resume and AudioFocus is released immediately — that happens (if
   * at all) inside stopPlayback(), which is not visible here.
   */
  haltAllPlayback(reason: string = ''): void {
    const shownReason = reason || '(no reason)';
    console.log('[Audio] haltAllPlayback: %s', shownReason);
    this._conversationFocusActive = false;
    this.stopPlayback();
  }

  /**
   * Specifically for phone calls: stop the AudioTrack and release the focus,
   * BUT keep the PCM buffer + messageId so that further chunks of the
   * interrupted answer keep being collected. isFinal then writes the WAV
   * despite the call — and resumeFromInterruption finds it.
   */
  pauseForCall(reason: string = ''): void {
    const shownReason = reason || '(no reason)';
    console.log('[Audio] pauseForCall: %s', shownReason);
    this._conversationFocusActive = false;
    this._pausedForCall = true;

    // Reset queue + isPlaying — otherwise the next play button gets stuck
    // (playAudio sees isPlaying=true and no longer calls _playNext).
    this.audioQueue = [];
    this.isPlaying = false;

    // Stop the foreground service — the notification would be misleading
    stopBackgroundAudio().catch(() => {});

    // Stop SoundPool/RNSound players (resume sound, play button) — not
    // relevant for auto-resume. Errors while stopping are ignored.
    for (const sound of [this.currentSound, this.resumeSound]) {
      if (!sound) continue;
      try {
        sound.stop();
        sound.release();
      } catch {}
    }
    this.currentSound = null;
    this.resumeSound = null;

    // Hard-stop the AudioTrack so nothing comes out of the speaker anymore.
    // pcmStreamActive stays true, pcmBuffer/pcmMessageId STAY — so further
    // chunks are collected and isFinal can write the WAV.
    PcmStreamPlayer?.stop().catch(() => {});
    this._cancelDeferredFocusRelease();
    AudioFocus?.release().catch(() => {});
  }

  /**
   * The call is over → further chunks may be played again.
* resumeFromInterruption takes over playback from the remembered position.
   */
  endCallPause(): void {
    const wasPaused = this._pausedForCall;
    if (!wasPaused) return;

    this._pausedForCall = false;
    console.log('[Audio] endCallPause');
  }

  /**
   * On a phone call: remember the current playback position so playback can
   * continue from there after hanging up.
   *
   * Idempotent: on repeated calls (ringing → offhook) the position from the
   * first call is NOT overwritten. playbackStartTime just keeps running even
   * though the audio is stopped — the first halt is the real one.
   *
   * @returns the position in seconds, or 0 if nothing was playing
   */
  captureInterruption(): number {
    const alreadyCaptured = this.pausedMessageId;
    if (alreadyCaptured) {
      console.log(
        '[Audio] captureInterruption: bereits erfasst (msgId=%s pos=%ss) — skip',
        alreadyCaptured,
        this.pausedPosition.toFixed(2),
      );
      return this.pausedPosition;
    }

    const startedAt = this.playbackStartTime;
    const playingMsgId = this.currentPlaybackMsgId;
    const wasPlaying = Boolean(startedAt) && Boolean(playingMsgId);
    if (!wasPlaying) {
      console.log(
        '[Audio] captureInterruption: nichts spielte (startTime=%s, msgId=%s)',
        startedAt,
        playingMsgId || '(leer)',
      );
      this.pausedPosition = 0;
      this.pausedMessageId = '';
      return 0;
    }

    // Elapsed wall-clock time minus the leading silence, never negative.
    const elapsedSec = (Date.now() - startedAt) / 1000;
    const capturedSec = Math.max(0, elapsedSec - this.LEADING_SILENCE_SEC);
    this.pausedPosition = capturedSec;
    this.pausedMessageId = playingMsgId;
    console.log('[Audio] captureInterruption: msgId=%s pos=%ss', this.pausedMessageId, capturedSec.toFixed(2));
    return capturedSec;
  }

  /**
   * After the call ended: continue playing from the remembered position. If
   * the cache file has not been written yet (the final chunk may not have
   * arrived during the call), poll for it until maxWaitMs has elapsed.
   * Returns true when playback was started.
*/
  async resumeFromInterruption(maxWaitMs: number = 30000): Promise<boolean> {
    const msgId = this.pausedMessageId;
    const position = this.pausedPosition;
    if (!msgId) {
      console.log('[Audio] resumeFromInterruption: kein gemerkter Stand — skip');
      return false;
    }
    console.log('[Audio] resumeFromInterruption: starte fuer msgId=%s pos=%ss', msgId, position.toFixed(2));
    this.pausedMessageId = ''; // consume the remembered state

    const cachePath = `${RNFS.DocumentDirectoryPath}/tts_cache/${msgId}.wav`;
    const startTime = Date.now();
    // Poll every 500 ms until the WAV shows up or the wait budget is used up.
    while (Date.now() - startTime < maxWaitMs) {
      try {
        if (await RNFS.exists(cachePath)) {
          return await this._playFromPathAtPosition(cachePath, position);
        }
      } catch {
        // exists() failed — treat like "not there yet" and keep polling.
      }
      await new Promise<void>(r => setTimeout(r, 500));
    }
    console.warn('[Audio] resumeFromInterruption: WAV %s nicht binnen %dms verfuegbar', msgId, maxWaitMs);
    return false;
  }

  /**
   * Loads the audio file at `path` and plays it starting at `positionSec`.
   *
   * @param path file path, with or without a leading file:// prefix
   * @param positionSec start position in seconds (negative values start at 0)
   * @returns true when playback was started, false when loading/starting failed
   */
  private async _playFromPathAtPosition(path: string, positionSec: number): Promise<boolean> {
    let sound: Sound | null = null;
    try {
      // Abort a resume playback that is still running so we start cleanly
      if (this.resumeSound) {
        try {
          this.resumeSound.stop();
          this.resumeSound.release();
        } catch {}
        this.resumeSound = null;
      }

      const loaded = await new Promise<Sound>((resolve, reject) => {
        const s = new Sound(path.replace(/^file:\/\//, ''), '', (err) => (err ? reject(err) : resolve(s)));
      });
      sound = loaded;

      // Request audio focus so Spotify pauses
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestDuck().catch(() => {});
      this._firePlaybackStarted();
      this.isPlaying = true;
      this.resumeSound = loaded;

      // Update the tracking for the resume sound as well — otherwise
      // captureInterruption cannot determine the position on a second call
      // (playbackStartTime would still be from the first playback).
      const msgIdMatch = path.match(/([^/\\]+)\.wav$/i);
      if (msgIdMatch) this.currentPlaybackMsgId = msgIdMatch[1];
      // Set a virtual start time such that captureInterruption (which
      // subtracts the leading-silence offset again) yields the right position.
      this.playbackStartTime = Date.now() - (positionSec + this.LEADING_SILENCE_SEC) * 1000;

      console.log('[Audio] Resume von Position %ss aus %s', positionSec.toFixed(2), path);
      loaded.setCurrentTime(Math.max(0, positionSec));
      loaded.play((success) => {
        if (!success) console.warn('[Audio] Resume-Wiedergabe fehlgeschlagen');
        try {
          loaded.release();
        } catch {}
        if (this.resumeSound === loaded) this.resumeSound = null;
        this.isPlaying = false;
        this.playbackFinishedListeners.forEach(cb => {
          try {
            cb();
          } catch (e) {
            console.warn('[Audio] cb err:', e);
          }
        });
        this._releaseFocusDeferred();
      });
      return true;
    } catch (err: any) {
      console.warn('[Audio] _playFromPathAtPosition fehlgeschlagen:', err?.message || err);
      // Do not leave a half-started playback behind: free the native player
      // and undo the "playing" bookkeeping this call may have set.
      if (sound) {
        try {
          sound.release();
        } catch {}
        if (this.resumeSound === sound) {
          this.resumeSound = null;
          this.isPlaying = false;
        }
      }
      return false;
    }
  }

  /**
   * True while ARIA is playing something — no matter whether WAV queue or PCM
   * stream. Useful for barge-in: when the user talks while ARIA talks, the
   * ARIA playback is to be aborted and the new user message processed
   * ("ah forget it, rather do X").
   */
  isPlayingAudio(): boolean {
    return this.isPlaying || this.pcmStreamActive;
  }

  // --- Permissions ---

  /**
   * Requests the RECORD_AUDIO permission on Android.
   *
   * @returns true when granted (always true on non-Android platforms),
   *          false when denied or when the request itself fails
   */
  async requestMicrophonePermission(): Promise<boolean> {
    if (Platform.OS !== 'android') {
      return true;
    }

    try {
      const granted = await PermissionsAndroid.request(
        PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
        {
          title: 'ARIA Cockpit - Mikrofon',
          message: 'ARIA benoetigt Zugriff auf das Mikrofon fuer Spracheingabe.',
          buttonPositive: 'Erlauben',
          buttonNegative: 'Ablehnen',
        },
      );
      return granted === PermissionsAndroid.RESULTS.GRANTED;
    } catch (err) {
      console.error('[Audio] Fehler bei Berechtigungsanfrage:', err);
      return false;
    }
  }

  // --- Recording ---

  /**
   * Starts the microphone recording.
   *
   * @param autoStop enable VAD — auto-stop on silence
   * @param noSpeechTimeoutMs if the user says nothing within this time,
   *        silence is reported (the recording is discarded). For the
   *        conversation window: after ARIA's answer the user only has
   *        N seconds to start talking, otherwise the conversation ends.
   * @returns true when the recording was started, false otherwise
*/
  async startRecording(autoStop: boolean = false, noSpeechTimeoutMs: number = 0): Promise<boolean> {
    if (this.recordingState !== 'idle') {
      console.warn('[Audio] Aufnahme laeuft bereits');
      return false;
    }

    const hasPermission = await this.requestMicrophonePermission();
    if (!hasPermission) {
      console.warn('[Audio] Keine Mikrofon-Berechtigung');
      return false;
    }

    try {
      // Stop running playback (so ARIA does not hear itself)
      this.stopPlayback();

      // Housekeeping: delete old aria_recording_ and aria_tts_ files
      // (protects against cache overflow in conversation mode with many cycles)
      this._cleanupStaleCacheFiles().catch(() => {});

      this.recordingPath = `${RNFS.CachesDirectoryPath}/aria_recording_${Date.now()}.mp4`;

      // Start the foreground service BEFORE the AudioRecord — otherwise
      // Android blocks background mic access (foregroundServiceType=microphone
      // has to be active at the time of startRecorder(), otherwise the
      // background-mic restrictions of Android 11+ kick in).
      await acquireBackgroundAudio('rec');

      // Start recording with metering
      await this.recorder.startRecorder(this.recordingPath, {
        AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
        AudioSourceAndroid: AudioSourceAndroidType.MIC,
        OutputFormatAndroid: OutputFormatAndroidType.MPEG_4,
        AudioSamplingRateAndroid: AUDIO_SAMPLE_RATE,
        AudioChannelsAndroid: AUDIO_CHANNELS,
      }, true); // meteringEnabled = true

      // Metering callback
      this.recorder.addRecordBackListener((e) => {
        const db = e.currentMetering ?? -160;
        this.meterListeners.forEach(cb => cb(db));

        // Adaptive baseline: collect the first 5 samples (~500 ms), then
        // adjust the thresholds. Ignore -160 (no metering) — otherwise the
        // baseline becomes pointlessly low.
        if (this.vadBaselineSamples.length < VAD_BASELINE_SAMPLES) {
          if (db > -100) {
            this.vadBaselineSamples.push(db);
            if (this.vadBaselineSamples.length === VAD_BASELINE_SAMPLES) {
              // Minimum instead of mean: robust against spike samples (e.g.
              // when the user talks right after the wake word or the wake-word
              // echo is still in the mic). The minimum is the quietest moment.
              const lowest = Math.min(...this.vadBaselineSamples);
              const rawSilence = lowest + VAD_SILENCE_OFFSET_DB;
              const rawSpeech = lowest + VAD_SPEECH_OFFSET_DB;
              // Clamp to a sensible range:
              // - silence threshold not above -28 dB (otherwise background
              //   noise permanently counts as "speech" → VAD never fires)
              // - silence threshold not below -50 dB (otherwise too strict)
              this.vadAdaptiveSilenceDb = Math.max(-50, Math.min(rawSilence, -28));
              this.vadAdaptiveSpeechDb = Math.max(-40, Math.min(rawSpeech, -18));
              const msg = `VAD: ambient=${lowest.toFixed(0)}dB stille>${this.vadAdaptiveSilenceDb.toFixed(0)}dB`;
              console.log('[Audio] %s speech>%s (raw silence=%s speech=%s)',
                msg, this.vadAdaptiveSpeechDb.toFixed(1), rawSilence.toFixed(1), rawSpeech.toFixed(1));
              try { ToastAndroid.show(msg, ToastAndroid.SHORT); } catch {}
            }
          }
        }

        // Speech gate: detect whether someone is actually talking
        if (db > this.vadAdaptiveSpeechDb) {
          if (!this.speechDetected && this.speechStartTime === 0) {
            this.speechStartTime = Date.now();
          }
          if (this.speechStartTime > 0 && Date.now() - this.speechStartTime >= VAD_SPEECH_MIN_MS) {
            this.speechDetected = true;
          }
        } else {
          if (!this.speechDetected) {
            this.speechStartTime = 0; // reset while not yet recognised as speech
          }
        }

        // VAD: every sample above the silence threshold refreshes
        // lastSpeechTime. NOTE(review): the original comment said this only
        // applies once speech was detected, but no such check exists here.
        if (this.vadEnabled) {
          if (db > this.vadAdaptiveSilenceDb) {
            this.lastSpeechTime = Date.now();
          }
        }
      });

      this.recordingStartTime = Date.now();
      this.lastSpeechTime = Date.now();
      this.speechDetected = false;
      this.speechStartTime = 0;
      // Reset the adaptive VAD: the baseline is measured anew during the
      // first 500 ms. Until then the fallback thresholds apply.
      this.vadBaselineSamples = [];
      this.vadAdaptiveSilenceDb = VAD_SILENCE_FALLBACK_DB;
      this.vadAdaptiveSpeechDb = VAD_SPEECH_FALLBACK_DB;
      // Manual override from the settings — when set, it overrules the
      // adaptive baseline measurement. The user's choice beats auto-magic.
      const dbOverride = await loadVadSilenceDbOverride();
      if (dbOverride != null) {
        this.vadAdaptiveSilenceDb = dbOverride;
        this.vadAdaptiveSpeechDb = dbOverride + 10; // speech clearly above silence
        this.vadBaselineSamples = new Array(VAD_BASELINE_SAMPLES).fill(0); // disables baseline collection
        const msg = `VAD: manuell stille>${dbOverride}dB`;
        console.log('[Audio] %s', msg);
        try { ToastAndroid.show(msg, ToastAndroid.SHORT); } catch {}
      }
      this.setState('recording');

      // Pause other apps during the recording (music, videos etc.)
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestExclusive().catch(() => {});

      // Enable VAD — silence duration comes from AsyncStorage (settings).
      // IMPORTANT: every trigger (VAD silence / max duration / no-speech
      // window) IMMEDIATELY disables the VAD flag and clears the timers BEFORE
      // the listeners are fired. Otherwise the setInterval keeps firing every
      // 200 ms and calls stopRecording in parallel → audio-recorder-player
      // crashes.
      this.vadEnabled = autoStop;
      this.silenceFired = false;
      const fireSilenceOnce = (reason: string) => {
        if (this.silenceFired) return;
        this.silenceFired = true;
        this.vadEnabled = false;
        if (this.vadTimer) { clearInterval(this.vadTimer); this.vadTimer = null; }
        if (this.maxDurationTimer) { clearTimeout(this.maxDurationTimer); this.maxDurationTimer = null; }
        if (this.noSpeechTimer) { clearTimeout(this.noSpeechTimer); this.noSpeechTimer = null; }
        console.log('[Audio] Silence-Fire: %s', reason);
        this.silenceListeners.forEach(cb => {
          try { cb(); } catch (e) { console.warn('[Audio] silence listener err:', e); }
        });
      };

      // stopRecording() may run while the settings below are being awaited;
      // it clears the timers it knows about, so timers armed afterwards would
      // leak and fire a spurious silence event. Only arm them while the
      // recording is still active.
      const isStillRecording = (): boolean => this.recordingState === 'recording';

      if (autoStop) {
        const vadSilenceMs = await loadVadSilenceMs();
        const maxRecordingMs = await loadMaxRecordingMs();
        console.log('[Audio] startRecording: autoStop=true, VAD-Stille=%dms, MAX=%dms',
          vadSilenceMs, maxRecordingMs);
        if (isStillRecording()) {
          this.vadTimer = setInterval(() => {
            const silenceDuration = Date.now() - this.lastSpeechTime;
            if (silenceDuration >= vadSilenceMs) {
              fireSilenceOnce(`VAD ${silenceDuration}ms Stille (Schwelle=${vadSilenceMs}ms)`);
            }
          }, 200);

          // Emergency brake: force a stop after maxRecordingMs
          this.maxDurationTimer = setTimeout(() => {
            fireSilenceOnce(`Max-Dauer ${maxRecordingMs}ms`);
          }, maxRecordingMs);
        }
      }

      // Conversation window: if the user does not start talking within
      // noSpeechTimeoutMs → abort the recording (the speech gate discards it).
      if (noSpeechTimeoutMs > 0 && isStillRecording()) {
        this.noSpeechTimer = setTimeout(() => {
          if (!this.speechDetected && this.recordingState === 'recording') {
            fireSilenceOnce(`Conversation-Window ${noSpeechTimeoutMs}ms ohne Sprache`);
          }
        }, noSpeechTimeoutMs);
      }

      console.log('[Audio] Aufnahme gestartet (autoStop: %s)', autoStop);
      return true;
    } catch (err) {
      // NOTE(review): acquireBackgroundAudio('rec') may already have succeeded
      // here; where it is released is not visible in this part of the file —
      // confirm the failure path does not leave the foreground service up.
      console.error('[Audio] Fehler beim Starten der Aufnahme:', err);
      this.setState('idle');
      return false;
    }
  }

  /**
   * Stops the recording and returns the result.
   *
   * @returns the recorded audio, or null when no recording was active, no
   *          speech was detected (speech gate) or stopping/reading failed
   */
  async stopRecording(): Promise<RecordingResult | null> {
    if (this.recordingState !== 'recording') {
      console.warn('[Audio] Keine aktive Aufnahme');
      return null;
    }

    this.setState('processing');
    this.vadEnabled = false;

    if (this.vadTimer) {
      clearInterval(this.vadTimer);
      this.vadTimer = null;
    }
    if (this.maxDurationTimer) {
      clearTimeout(this.maxDurationTimer);
      this.maxDurationTimer = null;
    }
    if (this.noSpeechTimer) {
      clearTimeout(this.noSpeechTimer);
      this.noSpeechTimer = null;
    }

    try {
      try {
        await this.recorder.stopRecorder();
      } finally {
        // Runs even when stopRecorder() throws: otherwise the metering
        // listener stays registered and the exclusive AudioFocus requested in
        // startRecording is never given back.
        this.recorder.removeRecordBackListener();
        // Release audio focus with a delay — the TTS answer is about to
        // arrive, Spotify should not come up in the gap.
        this._releaseFocusDeferred();
      }

      const durationMs = Date.now() - this.recordingStartTime;
      const hadSpeech = this.speechDetected;

      // Speech gate: no speech detected → discard the recording
      if (!hadSpeech) {
        RNFS.unlink(this.recordingPath).catch(() => {});
        this.setState('idle');
        console.log('[Audio] Aufnahme verworfen — keine Sprache erkannt (nur Umgebungsgeraeusche)');
        return null;
      }

      // Read the audio file as Base64
      const base64Data = await RNFS.readFile(this.recordingPath, 'base64');

      // Clean up the temp file
      RNFS.unlink(this.recordingPath).catch(() => {});

      this.setState('idle');

      console.log(`[Audio] Aufnahme beendet (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB, Sprache erkannt)`);

      return {
        base64: base64Data,
        durationMs,
        mimeType: 'audio/mp4', // AAC in an MP4 container
      };
    } catch (err) {
      console.error('[Audio] Fehler beim Stoppen der Aufnahme:', err);
      // The recording is unusable — do not leave the temp file behind.
      RNFS.unlink(this.recordingPath).catch(() => {});
      this.setState('idle');
      return null;
    }
  }

  // --- Playback ---

  /** Puts Base64-encoded audio into the queue and starts playback if idle. */
  async playAudio(base64Data: string): Promise<void> {
    if (!base64Data) return;
    // Respect the mute flag — robust against race conditions between the
    // user's click on mute and a TTS chunk arriving in the same tick.
    if (this._muted) {
      console.log('[Audio] playAudio: muted=true → skip');
      return;
    }
    this.audioQueue.push(base64Data);
    console.log('[Audio] playAudio: queued (queue=%d isPlaying=%s pausedForCall=%s)',
      this.audioQueue.length, this.isPlaying, this._pausedForCall);
    if (!this.isPlaying) {
      this._playNext();
    }
  }

  /**
   * Stores Base64 audio persistently under tts_cache/<messageId>.wav.
   * An existing file for the same messageId is overwritten, so with several
   * chunks per message only the last one remains (a real concatenation would
   * have to merge the WAV headers).
   *
   * @returns the file:// path, or '' on empty input or failure
   */
  async cacheAudio(base64Data: string, messageId: string): Promise<string> {
    if (!base64Data || !messageId) return '';
    try {
      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      await RNFS.mkdir(dir).catch(() => {});
      const path = `${dir}/${messageId}.wav`;
      await RNFS.writeFile(path, base64Data, 'base64');
      return `file://${path}`;
    } catch (err) {
      console.warn('[Audio] cacheAudio fehlgeschlagen:', err);
      return '';
    }
  }

  /**
   * Receives one PCM chunk of an audio_pcm message.
   * silent=true → only cache, do not play (e.g. when TTS is muted on this device).
   * On final=true returns the cache path (file://) or '' when not cached.
   *
   * This wrapper serialises consecutive chunk calls via a promise queue —
   * otherwise short streams raced: the final chunk could call `end()` BEFORE
   * the preceding `start()` had finished in the native module. The writer
   * thread then saw endRequested=true without ever processing chunks.
   */
  private _pcmChunkQueue: Promise<unknown> = Promise.resolve();

  async handlePcmChunk(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    const p = this._pcmChunkQueue.then(() => this._handlePcmChunkImpl(payload)).catch(err => {
      console.warn('[Audio] handlePcmChunk queued err:', err);
      return '';
    });
    // Chain only on the side effect — callers still get the per-call result
    this._pcmChunkQueue = p;
    return p;
  }

  private async _handlePcmChunkImpl(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    // _stoppedMessageId: the user stopped this answer mid-playback (clicked
    // mute). Even if mute is off again by now, this answer must not continue
    // playing. Only a new messageId resets that.
const incomingMsgId = payload.messageId || ''; const stoppedByUser = !!this._stoppedMessageId && incomingMsgId === this._stoppedMessageId; // Globaler Mute-Flag uebersteuert das per-Call silent — verhindert // Race-Conditions wenn der User zwischen Chunks den Mute-Knopf drueckt. // _pausedForCall: AudioTrack ist gestoppt waehrend Anruf — Chunks weiter // sammeln (fuer WAV-Cache), aber NICHT in den Player schicken. const silent = !!payload.silent || this._muted || this._pausedForCall || stoppedByUser; if (!silent && !PcmStreamPlayer) { console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar'); return ''; } // Debug-Log bei Chunk 0 eines neuen Streams — damit man im adb logcat // sieht warum der Auto-Playback greift oder nicht. if ((payload.chunk ?? 0) === 0 && !this.pcmStreamActive) { console.log('[Audio] PCM-Stream start: silent=%s messageId=%s sr=%s ch=%s', silent, payload.messageId || '(none)', payload.sampleRate, payload.channels); } const messageId = payload.messageId || ''; const sampleRate = payload.sampleRate || 24000; const channels = payload.channels || 1; const base64 = payload.base64 || ''; const isFinal = !!payload.final; // Neuer Stream? (messageId Wechsel oder nicht aktiv) if (!this.pcmStreamActive || this.pcmMessageId !== messageId) { if (this.pcmStreamActive && !silent) { try { await PcmStreamPlayer!.stop(); } catch {} this.pcmBuffer = []; this.pcmBytesCollected = 0; } // Resume-Sound stoppen falls noch aktiv (User hat nach Anruf eine // neue Frage gestellt — die alte interruptierte Antwort ist obsolet). if (this.resumeSound) { try { this.resumeSound.stop(); this.resumeSound.release(); } catch {} this.resumeSound = null; } // Pending Auto-Resume verwerfen wenn die neue Antwort eine andere // messageId hat. Sonst spielt nach 30s-Wartezeit der Resume die // ueberholte Antwort ab. 
if (this.pausedMessageId && this.pausedMessageId !== messageId) { console.log('[Audio] Neue TTS-Antwort (msgId=%s) — Auto-Resume fuer %s verworfen', messageId, this.pausedMessageId); this.pausedMessageId = ''; this.pausedPosition = 0; } // Stop-Marker zuruecksetzen wenn neue messageId — neue Antwort darf // wieder normal abspielen, egal ob Mute zwischendurch aktiv war. if (this._stoppedMessageId && this._stoppedMessageId !== messageId) { console.log('[Audio] Neue Antwort (msgId=%s) — Stop-Marker fuer %s zurueckgesetzt', messageId, this._stoppedMessageId); this._stoppedMessageId = ''; } this.pcmStreamActive = true; this.pcmMessageId = messageId; this.pcmSampleRate = sampleRate; this.pcmChannels = channels; this.pcmBuffer = []; this.pcmBytesCollected = 0; if (!silent) { const prerollSec = await loadPrerollSec(); try { await PcmStreamPlayer!.start(sampleRate, channels, prerollSec); } catch (err) { console.error('[Audio] PcmStreamPlayer.start fehlgeschlagen:', err); this.pcmStreamActive = false; return ''; } this._cancelDeferredFocusRelease(); AudioFocus?.requestDuck().catch(() => {}); this._firePlaybackStarted(); } } // Chunk — immer cachen, nur bei !silent auch abspielen if (base64) { if (!silent) { try { await PcmStreamPlayer!.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); } } if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) { this.pcmBuffer.push(base64); this.pcmBytesCollected += Math.floor(base64.length * 0.75); } } if (isFinal) { if (!silent) { // end() signalisiert dem Writer "keine weiteren Chunks". Aber WIR // releasen den AudioFocus NICHT hier — der writer braucht u.U. noch // 30+ Sekunden bis der Buffer wirklich abgespielt ist. Den release // triggert das native Event "PcmPlaybackFinished" wenn AudioTrack // wirklich am Ende ist (siehe ensurePlaybackFinishedListener). 
try { await PcmStreamPlayer!.end(); } catch {} // playbackFinished-Listener informieren (UI-Logik) this.playbackFinishedListeners.forEach(cb => { try { cb(); } catch (e) { console.warn('[Audio] playbackFinished cb err:', e); } }); } this.pcmStreamActive = false; if (messageId && this.pcmBuffer.length > 0) { const audioPath = await this._savePcmBufferAsWav(messageId); this.pcmBuffer = []; this.pcmBytesCollected = 0; this.pcmMessageId = ''; return audioPath; } this.pcmMessageId = ''; } return ''; } /** Gesammelte PCM-Chunks als WAV speichern. Gibt file:// Pfad zurueck. */ private async _savePcmBufferAsWav(messageId: string): Promise { try { const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`; await RNFS.mkdir(dir).catch(() => {}); const path = `${dir}/${messageId}.wav`; // WAV-Header fuer PCM s16le const sampleRate = this.pcmSampleRate; const channels = this.pcmChannels; const bitsPerSample = 16; const byteRate = sampleRate * channels * bitsPerSample / 8; const blockAlign = channels * bitsPerSample / 8; const dataSize = this.pcmBytesCollected; const fileSize = 36 + dataSize; // Header als Base64 (44 bytes) const header = new Uint8Array(44); const dv = new DataView(header.buffer); // "RIFF" header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46; dv.setUint32(4, fileSize, true); // "WAVE" header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45; // "fmt " header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20; dv.setUint32(16, 16, true); // fmt chunk size dv.setUint16(20, 1, true); // PCM format dv.setUint16(22, channels, true); dv.setUint32(24, sampleRate, true); dv.setUint32(28, byteRate, true); dv.setUint16(32, blockAlign, true); dv.setUint16(34, bitsPerSample, true); // "data" header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61; dv.setUint32(40, dataSize, true); // Header als base64 let headerB64 = ''; const chunk = 1024; for (let i = 0; i < header.length; i += chunk) { headerB64 += 
String.fromCharCode(...Array.from(header.slice(i, i + chunk))); } headerB64 = btoaSafe(headerB64); // Datei schreiben: Header + alle PCM-Chunks await RNFS.writeFile(path, headerB64, 'base64'); for (const b64 of this.pcmBuffer) { await RNFS.appendFile(path, b64, 'base64'); } console.log(`[Audio] PCM-Cache geschrieben: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`); return `file://${path}`; } catch (err) { console.warn('[Audio] _savePcmBufferAsWav fehlgeschlagen:', err); return ''; } } /** Audio aus lokaler Datei (file:// Pfad) in die Queue und abspielen. * Setzt zusaetzlich playbackStartTime + currentPlaybackMsgId damit ein * Anruf waehrend dieses Playbacks korrekt erfasst wird (ohne dieses * Tracking liefert captureInterruption nichts → kein Auto-Resume). */ async playFromPath(filePath: string): Promise { if (!filePath) return; try { const cleanPath = filePath.replace(/^file:\/\//, ''); if (!(await RNFS.exists(cleanPath))) { console.warn('[Audio] Cache-Datei existiert nicht mehr:', cleanPath); return; } // Dateiname ohne .wav als messageId nehmen (egal ob UUID oder andere ID) const fileMatch = cleanPath.match(/([^/\\]+)\.wav$/i); const msgId = fileMatch ? 
fileMatch[1] : ''; console.log('[Audio] playFromPath: cleanPath=%s → msgId=%s', cleanPath, msgId || '(leer)'); if (msgId) { this.currentPlaybackMsgId = msgId; this.playbackStartTime = Date.now() - this.LEADING_SILENCE_SEC * 1000; } const b64 = await RNFS.readFile(cleanPath, 'base64'); this.playAudio(b64); } catch (err) { console.warn('[Audio] playFromPath fehlgeschlagen:', err); } } // Callback wenn alle Audio-Teile abgespielt sind private playbackFinishedListeners: (() => void)[] = []; private playbackStartedListeners: (() => void)[] = []; onPlaybackFinished(callback: () => void): () => void { this.playbackFinishedListeners.push(callback); return () => { this.playbackFinishedListeners = this.playbackFinishedListeners.filter(cb => cb !== callback); }; } /** Callback wenn ARIAs TTS-Wiedergabe startet — fuer Wake-Word-parallel- * Listening waehrend ARIA spricht (Barge-In via "Computer" sagen). */ onPlaybackStarted(callback: () => void): () => void { this.playbackStartedListeners.push(callback); return () => { this.playbackStartedListeners = this.playbackStartedListeners.filter(cb => cb !== callback); }; } private _firePlaybackStarted(): void { // Tracking fuer Auto-Resume nach Anruf-Pause: NUR setzen wenn ein // PCM-Stream laeuft (Live-TTS). Bei Play-Button / Resume-Sound hat der // Caller (playFromPath / _playFromPathAtPosition) das Tracking schon // korrekt mit der msgId aus dem Pfad gesetzt — sonst wuerden wir hier // mit leerem pcmMessageId ueberschreiben. if (this.pcmMessageId) { this.playbackStartTime = Date.now(); this.currentPlaybackMsgId = this.pcmMessageId; } this.playbackStartedListeners.forEach(cb => { try { cb(); } catch (e) { console.warn('[Audio] playbackStarted listener err:', e); } }); } /** Naechstes Audio aus der Queue abspielen */ private async _playNext(): Promise { if (this.audioQueue.length === 0) { this.isPlaying = false; // Audio-Focus verzoegert abgeben → wenn gleich noch eine Antwort kommt, // bleibt Spotify pausiert. 
this._releaseFocusDeferred(); // Alle Audio-Teile abgespielt → Listener benachrichtigen this.playbackFinishedListeners.forEach(cb => cb()); return; } // Beim ersten Playback-Start: andere Apps ducken + Listener informieren if (!this.isPlaying) { this._cancelDeferredFocusRelease(); AudioFocus?.requestDuck().catch(() => {}); this._firePlaybackStarted(); } this.isPlaying = true; // Preloaded Sound verwenden wenn verfuegbar, sonst neu laden let sound: Sound; let soundPath: string; if (this.preloadedSound) { sound = this.preloadedSound; soundPath = this.preloadedPath; this.preloadedSound = null; this.preloadedPath = ''; // Daten aus Queue entfernen (wurde schon preloaded) this.audioQueue.shift(); } else { const base64Data = this.audioQueue.shift()!; try { soundPath = `${RNFS.CachesDirectoryPath}/aria_tts_${Date.now()}.wav`; await RNFS.writeFile(soundPath, base64Data, 'base64'); sound = await new Promise((resolve, reject) => { const s = new Sound(soundPath, '', (err) => err ? reject(err) : resolve(s)); }); } catch (err) { console.error('[Audio] Laden fehlgeschlagen:', err); this._playNext(); return; } } this.currentSound = sound; console.log('[Audio] Sound.play startet (path=%s)', soundPath); // Naechstes Audio schon vorbereiten waehrend dieses abspielt this._preloadNext(); sound.play((success) => { console.log('[Audio] Sound.play callback: success=%s queue=%d', success, this.audioQueue.length); if (!success) console.warn('[Audio] Wiedergabe fehlgeschlagen'); sound.release(); this.currentSound = null; RNFS.unlink(soundPath).catch(() => {}); this._playNext(); }); } /** Naechstes Audio im Hintergrund vorladen (verhindert Stottern) */ private async _preloadNext(): Promise { if (this.audioQueue.length === 0 || this.preloadedSound) return; const base64Data = this.audioQueue[0]; // Nicht shift — bleibt in Queue try { const tmpPath = `${RNFS.CachesDirectoryPath}/aria_tts_pre_${Date.now()}.wav`; await RNFS.writeFile(tmpPath, base64Data, 'base64'); this.preloadedSound = await new 
Promise((resolve, reject) => { const s = new Sound(tmpPath, '', (err) => err ? reject(err) : resolve(s)); }); this.preloadedPath = tmpPath; } catch { this.preloadedSound = null; this.preloadedPath = ''; } } /** Mute: alle eingehenden TTS-Chunks/WAVs werden ignoriert bis wieder * unmuted. Robuster als ein React-Ref weil hier kein Re-Render-Race ist * — die Bridge kann einen Chunk im selben JS-Tick liefern in dem der * User Mute geklickt hat. */ private _muted: boolean = false; /** Anruf laeuft → Chunks werden nur in den Cache-Buffer gepusht, nicht * abgespielt. Wird in pauseForCall gesetzt, in endCallPause/resumeFrom- * Interruption zurueckgenommen. */ private _pausedForCall: boolean = false; /** Wenn der User mid-Wiedergabe Mute drueckt: messageId der ABGEBROCHENEN * Antwort merken. Folge-Chunks dieser msgId werden silent ignoriert, auch * wenn der User Mute wieder ausschaltet — kein "Resume mid-Antwort". Eine * NEUE messageId resetted das, dann spielt's wieder normal. */ private _stoppedMessageId: string = ''; setMuted(muted: boolean): void { console.log('[Audio] setMuted: %s (currentSound=%s pcmStreamActive=%s)', muted, this.currentSound ? 'aktiv' : 'null', this.pcmStreamActive); this._muted = muted; if (muted) { // Aktuell laufende Antwort als "verworfen" markieren — nachfolgende // chunks dieser msgId werden silent gehalten auch wenn der User Mute // gleich wieder ausschaltet. Erst eine NEUE Antwort darf wieder reden. 
const activeMsgId = this.pcmMessageId || this.currentPlaybackMsgId; if (activeMsgId) { this._stoppedMessageId = activeMsgId; console.log('[Audio] Antwort %s als gestoppt markiert', activeMsgId); } this.stopPlayback(); } } isMuted(): boolean { return this._muted; } /** Laufende Wiedergabe stoppen + Queue leeren */ stopPlayback(): void { // Idempotent: wenn nichts mehr aktiv ist, NICHT noch einen Focus-Release/ // Kick-Cycle anstossen — Re-Renders triggern setMuted oft mehrfach hinter- // einander, und jeder weitere Kick lässt Spotify nochmal kurz pausieren. const hasAnything = !!(this.currentSound || this.resumeSound || this.preloadedSound || this.pcmStreamActive || this.audioQueue.length || this.isPlaying); if (!hasAnything) return; console.log('[Audio] stopPlayback: currentSound=%s queue=%d pcm=%s', this.currentSound ? 'aktiv' : 'null', this.audioQueue.length, this.pcmStreamActive); // Foreground-Service auch stoppen — sonst bleibt die Notification haengen // wenn Wiedergabe abgebrochen wird (Anruf, Cancel, Barge-In). stopBackgroundAudio().catch(() => {}); this.audioQueue = []; this.isPlaying = false; if (this.currentSound) { this.currentSound.stop(); this.currentSound.release(); this.currentSound = null; } if (this.resumeSound) { this.resumeSound.stop(); this.resumeSound.release(); this.resumeSound = null; } if (this.preloadedSound) { this.preloadedSound.release(); this.preloadedSound = null; if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {}); this.preloadedPath = ''; } // PCM-Stream ebenfalls hart stoppen (Cancel/Abbruch). // pcmStreamActive wird beim isFinal-Chunk schon false gesetzt — der // AudioTrack spielt aber noch sekundenlang aus seinem Buffer ab. Daher // IMMER stop() aufrufen, ohne den Flag zu pruefen (ist idempotent). PcmStreamPlayer?.stop().catch(() => {}); this.pcmStreamActive = false; this.pcmBuffer = []; this.pcmBytesCollected = 0; this.pcmMessageId = ''; // Audio-Focus sofort freigeben — User hat explizit abgebrochen. 
// Unser Focus war TRANSIENT, Spotify resumed darum automatisch beim // Abandon. Den frueheren kickReleaseMedia haben wir entfernt: er // requestete USAGE_MEDIA mit GAIN (permanent), was Spotify als // "user-action stopp" interpretierte und Auto-Resume verhinderte. this._cancelDeferredFocusRelease(); AudioFocus?.release().catch(() => {}); } // --- Status & Callbacks --- getRecordingState(): RecordingState { return this.recordingState; } /** Callback fuer Aufnahmestatus-Aenderungen */ onStateChange(callback: RecordingStateCallback): () => void { this.stateListeners.push(callback); return () => { this.stateListeners = this.stateListeners.filter(cb => cb !== callback); }; } /** Callback fuer Metering-Updates (dB Werte waehrend Aufnahme) */ onMeterUpdate(callback: MeterCallback): () => void { this.meterListeners.push(callback); return () => { this.meterListeners = this.meterListeners.filter(cb => cb !== callback); }; } /** Callback wenn VAD Stille erkennt (Auto-Stop) */ onSilenceDetected(callback: SilenceCallback): () => void { this.silenceListeners.push(callback); return () => { this.silenceListeners = this.silenceListeners.filter(cb => cb !== callback); }; } private setState(state: RecordingState): void { if (this.recordingState !== state) { this.recordingState = state; this.stateListeners.forEach(cb => cb(state)); } } /** Alte Aufnahme- und TTS-Files aus dem Cache loeschen. * Default 30s — verwendet beim Mikro-Start (kurze Lebensdauer reicht). * App-Start nutzt 5min damit gerade aktive Files nicht erwischt werden. */ private async _cleanupStaleCacheFiles(maxAgeMs: number = 30000): Promise { try { const files = await RNFS.readDir(RNFS.CachesDirectoryPath); const now = Date.now(); let removed = 0; let freedBytes = 0; for (const f of files) { if (!f.isFile()) continue; if (!f.name.startsWith('aria_recording_') && !f.name.startsWith('aria_tts_')) continue; const age = now - (f.mtime ? 
f.mtime.getTime() : 0); if (age > maxAgeMs) { freedBytes += parseInt(f.size as any, 10) || 0; await RNFS.unlink(f.path).catch(() => {}); removed += 1; } } if (removed > 0) { console.log('[Audio] Cache-Cleanup: %d Files entfernt, %.1fMB freigegeben', removed, freedBytes / 1024 / 1024); } } catch { // silent — cleanup ist best-effort } } /** Alte TTS-Cache-Dateien loeschen die nicht mehr referenziert sind (>30 Tage). */ async cleanupOldTTSCache(keepMessageIds: Set, maxAgeDays = 30): Promise { try { const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`; if (!(await RNFS.exists(dir))) return; const files = await RNFS.readDir(dir); const maxAgeMs = maxAgeDays * 24 * 60 * 60 * 1000; const now = Date.now(); for (const f of files) { if (!f.isFile() || !f.name.endsWith('.wav')) continue; const messageId = f.name.replace(/\.wav$/, ''); const age = now - (f.mtime ? f.mtime.getTime() : 0); // Loeschen wenn: nicht mehr referenziert UND aelter als X Tage if (!keepMessageIds.has(messageId) && age > maxAgeMs) { await RNFS.unlink(f.path).catch(() => {}); } } } catch { // silent } } /** Aktuelle Groesse des TTS-Caches. */ async getTtsCacheSize(): Promise<{ count: number; totalMB: number }> { let count = 0; let total = 0; try { const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`; if (await RNFS.exists(dir)) { const files = await RNFS.readDir(dir); for (const f of files) { if (!f.isFile() || !f.name.endsWith('.wav')) continue; count += 1; total += parseInt(f.size as any, 10) || 0; } } } catch {} return { count, totalMB: total / 1024 / 1024 }; } /** TTS-Cache komplett leeren (Settings-Button). 
*/ async clearTtsCache(): Promise<{ removed: number; freedMB: number }> { let removed = 0; let freed = 0; try { const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`; if (!(await RNFS.exists(dir))) return { removed: 0, freedMB: 0 }; const files = await RNFS.readDir(dir); for (const f of files) { if (!f.isFile() || !f.name.endsWith('.wav')) continue; const size = parseInt(f.size as any, 10) || 0; await RNFS.unlink(f.path).catch(() => {}); removed += 1; freed += size; } } catch {} return { removed, freedMB: freed / 1024 / 1024 }; } } // Singleton const audioService = new AudioService(); export default audioService;