/**
 * Audio service for voice input/output.
 *
 * Manages microphone recording (with VAD/auto-stop on silence),
 * TTS audio playback, and metering for visual feedback.
 * Uses react-native-audio-recorder-player for recording.
 */

import { Platform, PermissionsAndroid, NativeModules } from 'react-native';
import Sound from 'react-native-sound';
import RNFS from 'react-native-fs';
import AsyncStorage from '@react-native-async-storage/async-storage';
import AudioRecorderPlayer, {
  AudioEncoderAndroidType,
  AudioSourceAndroidType,
  AVEncodingOption,
  OutputFormatAndroidType,
} from 'react-native-audio-recorder-player';

// Base64 encoder for binary strings (header bytes → Base64)
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
function btoaSafe(bin: string): string {
  let out = '';
  const len = bin.length;
  for (let i = 0; i < len; i += 3) {
    const b1 = bin.charCodeAt(i) & 0xff;
    const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
    const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
    out += B64_CHARS[b1 >> 2];
    out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
    out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
    out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
  }
  return out;
}
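
// Sanity check (standard RFC 4648 test vectors): btoaSafe('Man') === 'TWFu',
// btoaSafe('Ma') === 'TWE=', btoaSafe('M') === 'TQ=='.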

// Native modules for audio focus (ducking/muting other apps)
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
  AudioFocus?: {
    requestDuck: () => Promise<boolean>;
    requestExclusive: () => Promise<boolean>;
    release: () => Promise<boolean>;
  };
  PcmStreamPlayer?: {
    start: (sampleRate: number, channels: number, prerollSeconds: number) => Promise<boolean>;
    writeChunk: (base64Pcm: string) => Promise<boolean>;
    end: () => Promise<boolean>;
    stop: () => Promise<boolean>;
  };
};

// --- Types ---

export interface RecordingResult {
  /** Base64-encoded audio data */
  base64: string;
  /** Duration in milliseconds */
  durationMs: number;
  /** MIME type (e.g. audio/wav) */
  mimeType: string;
}

export type RecordingState = 'idle' | 'recording' | 'processing';

type RecordingStateCallback = (state: RecordingState) => void;
type MeterCallback = (db: number) => void;
type SilenceCallback = () => void;

// --- Constants ---

const AUDIO_SAMPLE_RATE = 16000;
const AUDIO_CHANNELS = 1;
const AUDIO_ENCODING = 'audio/wav';

// VAD (Voice Activity Detection): silence detection
const VAD_SILENCE_THRESHOLD_DB = -45; // below this dB level counts as "silence"
const VAD_SPEECH_THRESHOLD_DB = -28; // above this counts as "speech" (speech gate); higher = less ambient noise
const VAD_SPEECH_MIN_MS = 500; // ms of speech before the recording counts; longer filters out coughs and knocks

// VAD silence (in seconds): how long a pause in speech is tolerated before
// the recording stops automatically. Configurable in the app settings.
export const VAD_SILENCE_DEFAULT_SEC = 2.8;
export const VAD_SILENCE_MIN_SEC = 1.0;
export const VAD_SILENCE_MAX_SEC = 8.0;
export const VAD_SILENCE_STORAGE_KEY = 'aria_vad_silence_sec';

// Conversation window (in seconds): after ARIA's reply the user has this much
// time to keep talking in conversation mode without the conversation ending.
// Say nothing within the window → conversation over.
export const CONV_WINDOW_DEFAULT_SEC = 8.0;
export const CONV_WINDOW_MIN_SEC = 3.0;
export const CONV_WINDOW_MAX_SEC = 20.0;
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';

// TTS playback speed: stored per device and passed to the bridge
// (speed param in the F5-TTS infer()). 1.0 = normal.
export const TTS_SPEED_DEFAULT = 1.0;
export const TTS_SPEED_MIN = 0.1;
export const TTS_SPEED_MAX = 5.0;
export const TTS_SPEED_STORAGE_KEY = 'aria_tts_speed';

export async function loadTtsSpeed(): Promise<number> {
  try {
    const raw = await AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY);
    if (raw != null) {
      const n = parseFloat(raw);
      if (isFinite(n) && n >= TTS_SPEED_MIN && n <= TTS_SPEED_MAX) return n;
    }
  } catch {}
  return TTS_SPEED_DEFAULT;
}
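
// The matching setter lives in the settings UI, not in this file. A minimal
// sketch (hypothetical helper, not part of the original API) that clamps to
// the allowed range before persisting could look like this:
export async function saveTtsSpeed(speed: number): Promise<void> {
  // Clamp into [TTS_SPEED_MIN, TTS_SPEED_MAX] so loadTtsSpeed() accepts it.
  const clamped = Math.min(TTS_SPEED_MAX, Math.max(TTS_SPEED_MIN, speed));
  try {
    await AsyncStorage.setItem(TTS_SPEED_STORAGE_KEY, String(clamped));
  } catch {
    // Best effort: on storage errors the default is used at load time.
  }
}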

export async function loadConvWindowMs(): Promise<number> {
  try {
    const raw = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY);
    if (raw != null) {
      const n = parseFloat(raw);
      if (isFinite(n) && n >= CONV_WINDOW_MIN_SEC && n <= CONV_WINDOW_MAX_SEC) {
        return Math.round(n * 1000);
      }
    }
  } catch {}
  return Math.round(CONV_WINDOW_DEFAULT_SEC * 1000);
}

async function loadVadSilenceMs(): Promise<number> {
  try {
    const raw = await AsyncStorage.getItem(VAD_SILENCE_STORAGE_KEY);
    if (raw != null) {
      const n = parseFloat(raw);
      if (isFinite(n) && n >= VAD_SILENCE_MIN_SEC && n <= VAD_SILENCE_MAX_SEC) {
        return Math.round(n * 1000);
      }
    }
  } catch {}
  return Math.round(VAD_SILENCE_DEFAULT_SEC * 1000);
}

// Maximum duration of one recording (emergency brake against runaway loops).
// Raised to 2 minutes so longer explanations get through as well.
const MAX_RECORDING_MS = 120000;

// Pre-roll: how much audio sits in the AudioTrack buffer before play() starts.
// Configurable via diagnostics/settings (key: aria_tts_preroll_sec).
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
export const TTS_PREROLL_MIN_SEC = 0; // 0 = play immediately (F5-TTS is fast enough)
export const TTS_PREROLL_MAX_SEC = 6.0;
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';

async function loadPrerollSec(): Promise<number> {
  try {
    const raw = await AsyncStorage.getItem(TTS_PREROLL_STORAGE_KEY);
    if (raw != null) {
      const n = parseFloat(raw);
      if (isFinite(n) && n >= TTS_PREROLL_MIN_SEC && n <= TTS_PREROLL_MAX_SEC) {
        return n;
      }
    }
  } catch {}
  return TTS_PREROLL_DEFAULT_SEC;
}

// --- Audio service ---

class AudioService {
  private recordingState: RecordingState = 'idle';
  private recordingStartTime: number = 0;
  private stateListeners: RecordingStateCallback[] = [];
  private meterListeners: MeterCallback[] = [];
  private silenceListeners: SilenceCallback[] = [];
  private currentSound: Sound | null = null;
  private recorder: AudioRecorderPlayer;
  private recordingPath: string = '';

  // Audio queue for sequential TTS playback
  private audioQueue: string[] = [];
  private isPlaying: boolean = false;
  private preloadedSound: Sound | null = null;
  private preloadedPath: string = '';

  // Speech gate: only send the recording if speech was actually detected
  private speechDetected: boolean = false;
  private speechStartTime: number = 0;

  // PCM stream (XTTS): active session plus cache buffer per messageId
  private pcmStreamActive: boolean = false;
  private pcmMessageId: string = '';
  private pcmSampleRate: number = 24000;
  private pcmChannels: number = 1;
  private pcmBuffer: string[] = []; // base64 chunks for building a WAV later
  private pcmBytesCollected: number = 0;
  private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB

  // AudioFocus is released with a delay: if ARIA sends a second reply right
  // away (or a new stream starts), Spotify stays paused. Without this delay,
  // Spotify briefly resumes in the tiny gap between two streams.
  private focusReleaseTimer: ReturnType<typeof setTimeout> | null = null;
  private readonly FOCUS_RELEASE_DELAY_MS = 800;

  // Conversation mode: while active (wake-word status 'conversing' OR we know
  // "ARIA is currently speaking in a multi-turn dialog"), we hold the
  // AudioFocus PERMANENTLY. The per-stream release is suppressed so Spotify
  // does not come back during render pauses or between replies.
  private _conversationFocusActive: boolean = false;

  // VAD state
  private vadEnabled: boolean = false;
  private lastSpeechTime: number = 0;
  private vadTimer: ReturnType<typeof setInterval> | null = null;
  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
  // Latch so the silence callback fires exactly once per recording
  private silenceFired: boolean = false;
  private noSpeechTimer: ReturnType<typeof setTimeout> | null = null;

  constructor() {
    this.recorder = new AudioRecorderPlayer();
    this.recorder.setSubscriptionDuration(0.1); // metering updates every 100ms
  }

  /** Release AudioFocus with a small delay: otherwise Spotify/YouTube briefly
   * resume in the gap between two TTS streams (or when ARIA sends a second
   * reply right away). In conversation mode (wake-word 'conversing') the
   * release is suppressed entirely; focus is held for the whole conversation. */
  private _releaseFocusDeferred(): void {
    if (this._conversationFocusActive) {
      this._cancelDeferredFocusRelease();
      return;
    }
    this._cancelDeferredFocusRelease();
    this.focusReleaseTimer = setTimeout(() => {
      this.focusReleaseTimer = null;
      if (this._conversationFocusActive) return;
      AudioFocus?.release().catch(() => {});
    }, this.FOCUS_RELEASE_DELAY_MS);
  }

  private _cancelDeferredFocusRelease(): void {
    if (this.focusReleaseTimer) {
      clearTimeout(this.focusReleaseTimer);
      this.focusReleaseTimer = null;
    }
  }

  /** Conversation mode starts → hold AudioFocus permanently (Spotify stays
   * paused). Idempotent: calling this multiple times is safe. */
  acquireConversationFocus(): void {
    if (this._conversationFocusActive) return;
    this._conversationFocusActive = true;
    this._cancelDeferredFocusRelease();
    console.log('[Audio] Conversation focus active (Spotify stays paused)');
    AudioFocus?.requestDuck().catch(() => {});
  }

  /** Conversation mode ends → focus may be released again (deferred, so an
   * immediately following reply does not break anything). */
  releaseConversationFocus(): void {
    if (!this._conversationFocusActive) return;
    this._conversationFocusActive = false;
    console.log('[Audio] Conversation focus inactive');
    this._releaseFocusDeferred();
  }
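
  // Hypothetical caller sketch (the wake-word service is an assumption, it is
  // not defined in this file): hold focus across a whole multi-turn exchange.
  //
  //   wakeWord.onStatus(status => {
  //     if (status === 'conversing') audioService.acquireConversationFocus();
  //     else audioService.releaseConversationFocus();
  //   });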

  /** Hard-stop TTS playback, e.g. when a phone call comes in. Also releases
   * the AudioFocus immediately so the ringtone is audible. */
  haltAllPlayback(reason: string = ''): void {
    console.log('[Audio] haltAllPlayback: %s', reason || '(no reason)');
    this._conversationFocusActive = false;
    this.stopPlayback();
  }

  // --- Permissions ---

  async requestMicrophonePermission(): Promise<boolean> {
    if (Platform.OS !== 'android') {
      return true;
    }

    try {
      const granted = await PermissionsAndroid.request(
        PermissionsAndroid.PERMISSIONS.RECORD_AUDIO,
        {
          title: 'ARIA Cockpit - Microphone',
          message: 'ARIA needs access to the microphone for voice input.',
          buttonPositive: 'Allow',
          buttonNegative: 'Deny',
        },
      );
      return granted === PermissionsAndroid.RESULTS.GRANTED;
    } catch (err) {
      console.error('[Audio] Permission request failed:', err);
      return false;
    }
  }

  // --- Recording ---

  /** Start microphone recording.
   *
   * @param autoStop enable VAD: auto-stop on silence
   * @param noSpeechTimeoutMs if the user says nothing within this time,
   *                          silence is reported (the recording is discarded).
   *                          Used for the conversation window: after ARIA's
   *                          reply you only have N seconds to start talking,
   *                          otherwise the conversation ends.
   */
  async startRecording(autoStop: boolean = false, noSpeechTimeoutMs: number = 0): Promise<boolean> {
    if (this.recordingState !== 'idle') {
      console.warn('[Audio] Recording already in progress');
      return false;
    }

    const hasPermission = await this.requestMicrophonePermission();
    if (!hasPermission) {
      console.warn('[Audio] No microphone permission');
      return false;
    }

    try {
      // Stop any running playback (so ARIA does not hear itself)
      this.stopPlayback();

      // Cleanup: delete old aria_recording_ and aria_tts_ files
      // (guards against cache overflow in conversation mode with many cycles)
      this._cleanupStaleCacheFiles().catch(() => {});

      this.recordingPath = `${RNFS.CachesDirectoryPath}/aria_recording_${Date.now()}.mp4`;

      // Start recording with metering enabled
      await this.recorder.startRecorder(this.recordingPath, {
        AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
        AudioSourceAndroid: AudioSourceAndroidType.MIC,
        OutputFormatAndroid: OutputFormatAndroidType.MPEG_4,
        AudioSamplingRateAndroid: AUDIO_SAMPLE_RATE,
        AudioChannelsAndroid: AUDIO_CHANNELS,
      }, true); // meteringEnabled = true

      // Metering callback
      this.recorder.addRecordBackListener((e) => {
        const db = e.currentMetering ?? -160;
        this.meterListeners.forEach(cb => cb(db));

        // Speech gate: detect whether someone is actually talking
        if (db > VAD_SPEECH_THRESHOLD_DB) {
          if (!this.speechDetected && this.speechStartTime === 0) {
            this.speechStartTime = Date.now();
          }
          if (this.speechStartTime > 0 && Date.now() - this.speechStartTime >= VAD_SPEECH_MIN_MS) {
            this.speechDetected = true;
          }
        } else {
          if (!this.speechDetected) {
            this.speechStartTime = 0; // reset while not yet classified as speech
          }
        }

        // VAD: track silence (only relevant once speech was detected)
        if (this.vadEnabled) {
          if (db > VAD_SILENCE_THRESHOLD_DB) {
            this.lastSpeechTime = Date.now();
          }
        }
      });

      this.recordingStartTime = Date.now();
      this.lastSpeechTime = Date.now();
      this.speechDetected = false;
      this.speechStartTime = 0;
      this.setState('recording');

      // Pause other apps while recording (music, videos, etc.)
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestExclusive().catch(() => {});

      // Enable VAD; the silence duration comes from AsyncStorage (settings).
      // IMPORTANT: every trigger (VAD silence / max duration / no-speech
      // window) must disable the VAD flag and clear the timer IMMEDIATELY,
      // BEFORE the listeners fire. Otherwise the setInterval keeps firing
      // every 200ms and calls stopRecording in parallel, which crashes
      // audio-recorder-player.
      this.vadEnabled = autoStop;
      this.silenceFired = false;
      const fireSilenceOnce = (reason: string) => {
        if (this.silenceFired) return;
        this.silenceFired = true;
        this.vadEnabled = false;
        if (this.vadTimer) { clearInterval(this.vadTimer); this.vadTimer = null; }
        if (this.maxDurationTimer) { clearTimeout(this.maxDurationTimer); this.maxDurationTimer = null; }
        if (this.noSpeechTimer) { clearTimeout(this.noSpeechTimer); this.noSpeechTimer = null; }
        console.log('[Audio] Silence fired: %s', reason);
        this.silenceListeners.forEach(cb => {
          try { cb(); } catch (e) { console.warn('[Audio] silence listener err:', e); }
        });
      };
      if (autoStop) {
        const vadSilenceMs = await loadVadSilenceMs();
        console.log('[Audio] startRecording: autoStop=true, VAD silence=%dms, MAX=%dms',
          vadSilenceMs, MAX_RECORDING_MS);
        this.vadTimer = setInterval(() => {
          const silenceDuration = Date.now() - this.lastSpeechTime;
          if (silenceDuration >= vadSilenceMs) {
            fireSilenceOnce(`VAD ${silenceDuration}ms of silence (threshold=${vadSilenceMs}ms)`);
          }
        }, 200);
        // Emergency brake: force-stop after MAX_RECORDING_MS
        this.maxDurationTimer = setTimeout(() => {
          fireSilenceOnce(`max duration ${MAX_RECORDING_MS}ms`);
        }, MAX_RECORDING_MS);
      }

      // Conversation window: if the user does not start talking within
      // noSpeechTimeoutMs → abort the recording (the speech gate discards it).
      if (noSpeechTimeoutMs > 0) {
        this.noSpeechTimer = setTimeout(() => {
          if (!this.speechDetected && this.recordingState === 'recording') {
            fireSilenceOnce(`conversation window ${noSpeechTimeoutMs}ms without speech`);
          }
        }, noSpeechTimeoutMs);
      }

      console.log('[Audio] Recording started (autoStop: %s)', autoStop);
      return true;
    } catch (err) {
      console.error('[Audio] Failed to start recording:', err);
      this.setState('idle');
      return false;
    }
  }

  /** Stop the recording and return the result */
  async stopRecording(): Promise<RecordingResult | null> {
    if (this.recordingState !== 'recording') {
      console.warn('[Audio] No active recording');
      return null;
    }

    this.setState('processing');
    this.vadEnabled = false;
    if (this.vadTimer) {
      clearInterval(this.vadTimer);
      this.vadTimer = null;
    }
    if (this.maxDurationTimer) {
      clearTimeout(this.maxDurationTimer);
      this.maxDurationTimer = null;
    }
    if (this.noSpeechTimer) {
      clearTimeout(this.noSpeechTimer);
      this.noSpeechTimer = null;
    }

    try {
      await this.recorder.stopRecorder();
      this.recorder.removeRecordBackListener();

      // Release audio focus with a delay: the TTS reply is coming up next,
      // and Spotify must not pop up in the gap.
      this._releaseFocusDeferred();

      const durationMs = Date.now() - this.recordingStartTime;
      const hadSpeech = this.speechDetected;

      // Speech gate: no speech detected → discard the recording
      if (!hadSpeech) {
        RNFS.unlink(this.recordingPath).catch(() => {});
        this.setState('idle');
        console.log('[Audio] Recording discarded: no speech detected (ambient noise only)');
        return null;
      }

      // Read the audio file as Base64
      const base64Data = await RNFS.readFile(this.recordingPath, 'base64');

      // Clean up the temp file
      RNFS.unlink(this.recordingPath).catch(() => {});

      this.setState('idle');
      console.log(`[Audio] Recording finished (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB, speech detected)`);

      return {
        base64: base64Data,
        durationMs,
        mimeType: 'audio/mp4', // AAC in an MP4 container
      };
    } catch (err) {
      console.error('[Audio] Failed to stop recording:', err);
      this.setState('idle');
      return null;
    }
  }

  // --- Playback ---

  /** Queue Base64-encoded audio and play it */
  async playAudio(base64Data: string): Promise<void> {
    if (!base64Data) return;

    this.audioQueue.push(base64Data);
    if (!this.isPlaying) {
      this._playNext();
    }
  }

  /** Persist Base64 audio. Returns the file:// path (or '' on error). */
  async cacheAudio(base64Data: string, messageId: string): Promise<string> {
    if (!base64Data || !messageId) return '';
    try {
      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      await RNFS.mkdir(dir).catch(() => {});
      const path = `${dir}/${messageId}.wav`;
      // XTTS sends multiple chunks per message; for now each write simply
      // overwrites the file, so only the last chunk survives. True
      // concatenation would require merging the WAV headers.
      await RNFS.writeFile(path, base64Data, 'base64');
      return `file://${path}`;
    } catch (err) {
      console.warn('[Audio] cacheAudio failed:', err);
      return '';
    }
  }

  /** Receive one PCM chunk from an audio_pcm message.
   * silent=true → only cache, do not play (e.g. when TTS is muted on this device).
   * On final=true returns the cache path (file://), or '' if not cached.
   *
   * The wrapper serializes consecutive chunk calls via a promise queue;
   * without it, short streams had a race: the final chunk could call `end()`
   * BEFORE the preceding `start()` had finished in the native module. The
   * writer thread then saw endRequested=true without ever processing chunks. */
  private _pcmChunkQueue: Promise<any> = Promise.resolve();
  async handlePcmChunk(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    const p = this._pcmChunkQueue.then(() => this._handlePcmChunkImpl(payload)).catch(err => {
      console.warn('[Audio] handlePcmChunk queued err:', err);
      return '';
    });
    // Chain only on the side effect; callers still get the per-call result
    this._pcmChunkQueue = p;
    return p;
  }
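
  // Hypothetical feed sketch (the WebSocket handler and field names are
  // assumptions, they are not defined in this file):
  //
  //   socket.on('audio_pcm', async msg => {
  //     const cachePath = await audioService.handlePcmChunk(msg);
  //     if (cachePath) attachAudioToMessage(msg.messageId, cachePath);
  //   });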

  private async _handlePcmChunkImpl(payload: {
    base64: string;
    sampleRate?: number;
    channels?: number;
    messageId?: string;
    chunk?: number;
    final?: boolean;
    silent?: boolean;
  }): Promise<string> {
    const silent = !!payload.silent;
    if (!silent && !PcmStreamPlayer) {
      console.warn('[Audio] PcmStreamPlayer native module not available');
      return '';
    }
    // Debug log on chunk 0 of a new stream, so adb logcat shows why
    // auto-playback kicks in or not.
    if ((payload.chunk ?? 0) === 0 && !this.pcmStreamActive) {
      console.log('[Audio] PCM stream start: silent=%s messageId=%s sr=%s ch=%s',
        silent, payload.messageId || '(none)',
        payload.sampleRate, payload.channels);
    }

    const messageId = payload.messageId || '';
    const sampleRate = payload.sampleRate || 24000;
    const channels = payload.channels || 1;
    const base64 = payload.base64 || '';
    const isFinal = !!payload.final;

    // New stream? (messageId changed or no stream active)
    if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
      if (this.pcmStreamActive && !silent) {
        try { await PcmStreamPlayer!.stop(); } catch {}
        this.pcmBuffer = [];
        this.pcmBytesCollected = 0;
      }
      this.pcmStreamActive = true;
      this.pcmMessageId = messageId;
      this.pcmSampleRate = sampleRate;
      this.pcmChannels = channels;
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      if (!silent) {
        const prerollSec = await loadPrerollSec();
        try {
          await PcmStreamPlayer!.start(sampleRate, channels, prerollSec);
        } catch (err) {
          console.error('[Audio] PcmStreamPlayer.start failed:', err);
          this.pcmStreamActive = false;
          return '';
        }
        this._cancelDeferredFocusRelease();
        AudioFocus?.requestDuck().catch(() => {});
      }
    }

    // Chunk: always cache, but only play when !silent
    if (base64) {
      if (!silent) {
        try { await PcmStreamPlayer!.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
      }
      if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
        this.pcmBuffer.push(base64);
        // Base64 inflates data by 4/3, so the decoded size is ~0.75 of the string length
        this.pcmBytesCollected += Math.floor(base64.length * 0.75);
      }
    }

    if (isFinal) {
      if (!silent) {
        // end() only resolves once the native writer thread is done (all
        // samples played out); after that, release AudioFocus with a delay
        // so Spotify/YouTube do not spin up in the micro-gap between two
        // ARIA replies. If a new stream starts within FOCUS_RELEASE_DELAY_MS,
        // the release is cancelled.
        try { await PcmStreamPlayer!.end(); } catch {}
        this._releaseFocusDeferred();
      }
      this.pcmStreamActive = false;

      if (messageId && this.pcmBuffer.length > 0) {
        const audioPath = await this._savePcmBufferAsWav(messageId);
        this.pcmBuffer = [];
        this.pcmBytesCollected = 0;
        this.pcmMessageId = '';
        return audioPath;
      }
      this.pcmMessageId = '';
    }
    return '';
  }

  /** Save the collected PCM chunks as a WAV file. Returns the file:// path. */
  private async _savePcmBufferAsWav(messageId: string): Promise<string> {
    try {
      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      await RNFS.mkdir(dir).catch(() => {});
      const path = `${dir}/${messageId}.wav`;

      // WAV header for PCM s16le
      const sampleRate = this.pcmSampleRate;
      const channels = this.pcmChannels;
      const bitsPerSample = 16;
      const byteRate = sampleRate * channels * bitsPerSample / 8;
      const blockAlign = channels * bitsPerSample / 8;
      const dataSize = this.pcmBytesCollected;
      const fileSize = 36 + dataSize; // RIFF chunk size = 44-byte header - 8 + data
      // (Playback duration follows as dataSize / byteRate seconds.)

      // Header as bytes (44 bytes total)
      const header = new Uint8Array(44);
      const dv = new DataView(header.buffer);
      // "RIFF"
      header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46;
      dv.setUint32(4, fileSize, true);
      // "WAVE"
      header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45;
      // "fmt "
      header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20;
      dv.setUint32(16, 16, true); // fmt chunk size
      dv.setUint16(20, 1, true); // PCM format
      dv.setUint16(22, channels, true);
      dv.setUint32(24, sampleRate, true);
      dv.setUint32(28, byteRate, true);
      dv.setUint16(32, blockAlign, true);
      dv.setUint16(34, bitsPerSample, true);
      // "data"
      header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61;
      dv.setUint32(40, dataSize, true);

      // Header as base64
      let headerB64 = '';
      const chunk = 1024;
      for (let i = 0; i < header.length; i += chunk) {
        headerB64 += String.fromCharCode(...Array.from(header.slice(i, i + chunk)));
      }
      headerB64 = btoaSafe(headerB64);

      // Write the file: header plus all PCM chunks
      await RNFS.writeFile(path, headerB64, 'base64');
      for (const b64 of this.pcmBuffer) {
        await RNFS.appendFile(path, b64, 'base64');
      }
      console.log(`[Audio] PCM cache written: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
      return `file://${path}`;
    } catch (err) {
      console.warn('[Audio] _savePcmBufferAsWav failed:', err);
      return '';
    }
  }

  /** Queue audio from a local file (file:// path) and play it. */
  async playFromPath(filePath: string): Promise<void> {
    if (!filePath) return;
    try {
      const cleanPath = filePath.replace(/^file:\/\//, '');
      if (!(await RNFS.exists(cleanPath))) {
        console.warn('[Audio] Cache file no longer exists:', cleanPath);
        return;
      }
      const b64 = await RNFS.readFile(cleanPath, 'base64');
      this.playAudio(b64);
    } catch (err) {
      console.warn('[Audio] playFromPath failed:', err);
    }
  }

  // Callbacks fired once all queued audio parts have been played
  private playbackFinishedListeners: (() => void)[] = [];

  onPlaybackFinished(callback: () => void): () => void {
    this.playbackFinishedListeners.push(callback);
    return () => {
      this.playbackFinishedListeners = this.playbackFinishedListeners.filter(cb => cb !== callback);
    };
  }
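
  // Hypothetical sketch (the conversation-mode caller is an assumption, not
  // part of this file): re-open the microphone once ARIA's reply has fully
  // played out, passing the conversation window as the no-speech timeout.
  //
  //   audioService.onPlaybackFinished(async () => {
  //     const windowMs = await loadConvWindowMs();
  //     audioService.startRecording(true, windowMs);
  //   });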

  /** Play the next audio item from the queue */
  private async _playNext(): Promise<void> {
    if (this.audioQueue.length === 0) {
      this.isPlaying = false;
      // Release audio focus with a delay → if another reply arrives shortly,
      // Spotify stays paused.
      this._releaseFocusDeferred();
      // All audio parts played → notify listeners
      this.playbackFinishedListeners.forEach(cb => cb());
      return;
    }

    // On first playback start: duck other apps
    if (!this.isPlaying) {
      this._cancelDeferredFocusRelease();
      AudioFocus?.requestDuck().catch(() => {});
    }
    this.isPlaying = true;

    // Use the preloaded sound if available, otherwise load fresh
    let sound: Sound;
    let soundPath: string;

    if (this.preloadedSound) {
      sound = this.preloadedSound;
      soundPath = this.preloadedPath;
      this.preloadedSound = null;
      this.preloadedPath = '';
      // Remove the data from the queue (it was already preloaded)
      this.audioQueue.shift();
    } else {
      const base64Data = this.audioQueue.shift()!;
      try {
        soundPath = `${RNFS.CachesDirectoryPath}/aria_tts_${Date.now()}.wav`;
        await RNFS.writeFile(soundPath, base64Data, 'base64');
        sound = await new Promise<Sound>((resolve, reject) => {
          const s = new Sound(soundPath, '', (err) => err ? reject(err) : resolve(s));
        });
      } catch (err) {
        console.error('[Audio] Loading failed:', err);
        this._playNext();
        return;
      }
    }

    this.currentSound = sound;

    // Prepare the next audio item while this one is playing
    this._preloadNext();

    sound.play((success) => {
      if (!success) console.warn('[Audio] Playback failed');
      sound.release();
      this.currentSound = null;
      RNFS.unlink(soundPath).catch(() => {});
      this._playNext();
    });
  }

  /** Preload the next audio item in the background (prevents stutter) */
  private async _preloadNext(): Promise<void> {
    if (this.audioQueue.length === 0 || this.preloadedSound) return;

    const base64Data = this.audioQueue[0]; // no shift: it stays in the queue
    try {
      const tmpPath = `${RNFS.CachesDirectoryPath}/aria_tts_pre_${Date.now()}.wav`;
      await RNFS.writeFile(tmpPath, base64Data, 'base64');
      this.preloadedSound = await new Promise<Sound>((resolve, reject) => {
        const s = new Sound(tmpPath, '', (err) => err ? reject(err) : resolve(s));
      });
      this.preloadedPath = tmpPath;
    } catch {
      this.preloadedSound = null;
      this.preloadedPath = '';
    }
  }

  /** Stop the running playback and clear the queue */
  stopPlayback(): void {
    this.audioQueue = [];
    this.isPlaying = false;
    if (this.currentSound) {
      this.currentSound.stop();
      this.currentSound.release();
      this.currentSound = null;
    }
    if (this.preloadedSound) {
      this.preloadedSound.release();
      this.preloadedSound = null;
      if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
      this.preloadedPath = '';
    }
    // Hard-stop the PCM stream as well (cancel/abort)
    if (this.pcmStreamActive) {
      PcmStreamPlayer?.stop().catch(() => {});
      this.pcmStreamActive = false;
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      this.pcmMessageId = '';
    }
    // Release audio focus immediately: the user cancelled explicitly
    this._cancelDeferredFocusRelease();
    AudioFocus?.release().catch(() => {});
  }

  // --- Status & callbacks ---

  getRecordingState(): RecordingState {
    return this.recordingState;
  }

  /** Callback for recording state changes */
  onStateChange(callback: RecordingStateCallback): () => void {
    this.stateListeners.push(callback);
    return () => {
      this.stateListeners = this.stateListeners.filter(cb => cb !== callback);
    };
  }

  /** Callback for metering updates (dB values while recording) */
  onMeterUpdate(callback: MeterCallback): () => void {
    this.meterListeners.push(callback);
    return () => {
      this.meterListeners = this.meterListeners.filter(cb => cb !== callback);
    };
  }

  /** Callback fired when VAD detects silence (auto-stop) */
  onSilenceDetected(callback: SilenceCallback): () => void {
    this.silenceListeners.push(callback);
    return () => {
      this.silenceListeners = this.silenceListeners.filter(cb => cb !== callback);
    };
  }
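
  // Hypothetical wiring sketch (sendToBackend is an assumption, not part of
  // this file): auto-send a recording once VAD reports silence.
  //
  //   const unsubscribe = audioService.onSilenceDetected(async () => {
  //     const rec = await audioService.stopRecording();
  //     if (rec) sendToBackend(rec.base64, rec.mimeType);
  //   });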

  private setState(state: RecordingState): void {
    if (this.recordingState !== state) {
      this.recordingState = state;
      this.stateListeners.forEach(cb => cb(state));
    }
  }

  /** Delete old recording and TTS files from the cache (>30s old). */
  private async _cleanupStaleCacheFiles(): Promise<void> {
    try {
      const files = await RNFS.readDir(RNFS.CachesDirectoryPath);
      const now = Date.now();
      for (const f of files) {
        if (!f.isFile()) continue;
        if (!f.name.startsWith('aria_recording_') && !f.name.startsWith('aria_tts_')) continue;
        const age = now - (f.mtime ? f.mtime.getTime() : 0);
        if (age > 30000) {
          await RNFS.unlink(f.path).catch(() => {});
        }
      }
    } catch {
      // silent: cleanup is best-effort
    }
  }

  /** Delete old TTS cache files that are no longer referenced (>30 days). */
  async cleanupOldTTSCache(keepMessageIds: Set<string>, maxAgeDays = 30): Promise<void> {
    try {
      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
      if (!(await RNFS.exists(dir))) return;
      const files = await RNFS.readDir(dir);
      const maxAgeMs = maxAgeDays * 24 * 60 * 60 * 1000;
      const now = Date.now();
      for (const f of files) {
        if (!f.isFile() || !f.name.endsWith('.wav')) continue;
        const messageId = f.name.replace(/\.wav$/, '');
        const age = now - (f.mtime ? f.mtime.getTime() : 0);
        // Delete when: no longer referenced AND older than X days
        if (!keepMessageIds.has(messageId) && age > maxAgeMs) {
          await RNFS.unlink(f.path).catch(() => {});
        }
      }
    } catch {
      // silent
    }
  }
}

// Singleton
const audioService = new AudioService();
export default audioService;
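
// End-to-end usage sketch (hypothetical caller, e.g. a push-to-talk handler;
// transcribeAndReply is an assumption, it is not part of this file):
//
//   if (await audioService.startRecording(true)) {       // VAD auto-stop on
//     const stop = audioService.onSilenceDetected(async () => {
//       stop();                                          // one-shot listener
//       const rec = await audioService.stopRecording();  // null if no speech
//       if (rec) {
//         const ttsBase64 = await transcribeAndReply(rec.base64); // assumed
//         await audioService.playAudio(ttsBase64);
//       }
//     });
//   }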