feat(app): Streaming-STT-Pipeline — Phase 1+2 verdrahtet
audio.ts:
- neue Methoden startStreamingRecording / stopStreamingRecording /
cancelStreamingRecording mit PcmStreamRecorder als AudioRecord-Source
- permanenter RVS-Listener fuer stt_partial / stt_endpoint / stt_stream_done,
Filterung ueber streamRequestId-Match
- Callbacks onSttEndpoint(SttEndpointEvent) + onSttPartial(text)
- No-Speech-Watchdog + App-seitiger Hard-Cap (+2s Toleranz gegen Bridge)
- cancelStreamingRecording feuert onSttEndpoint mit text='' damit
ChatScreen den No-Speech-Fall behandeln kann (wie frueher
onSilenceDetected -> stopRecording() -> null)
- Legacy startRecording / stopRecording / onSilenceDetected unangetastet
-- VoiceButton (manuelle Aufnahme) nutzt das weiterhin
ChatScreen.tsx:
- Wake-Callback: startRecording -> startStreamingRecording
- Bubble wird sofort gebaut, audioRequestId landet via
stt_endpoint -> chat(sender=stt) im chat-Handler-Update-Pfad wie bisher
- onSilenceDetected entfernt, ersetzt durch onSttEndpoint:
text != '' -> log, aria-bridge triggert Brain selbst (Phase-2-Shortcut)
text == '' -> endConversation (No-Speech-Fall)
- Barge-In via Wake-Word: ebenfalls auf Streaming umgestellt
- AppState-resume + toggleWakeWord-off pruefen jetzt isStreamingRecording()
und nutzen passenden Cancel
Damit: kein dB/VAD mehr im Hot-Path. Whisper hoert auf semantische
Stille (kein neuer Text), Brain bekommt den Text direkt von aria-bridge,
Audio-Roundtrip App->aria->whisper->aria->App entfaellt komplett.
This commit is contained in:
@@ -531,7 +531,14 @@ const ChatScreen: React.FC = () => {
|
|||||||
if (bgDur > 30_000) {
|
if (bgDur > 30_000) {
|
||||||
wakeWordService.discardIfFreshlyTriggered(15_000).then(discarded => {
|
wakeWordService.discardIfFreshlyTriggered(15_000).then(discarded => {
|
||||||
if (discarded) {
|
if (discarded) {
|
||||||
try { audioService.cancelRecording(); } catch {}
|
// Sowohl legacy als auch Streaming-Pfad abdecken
|
||||||
|
try {
|
||||||
|
if (audioService.isStreamingRecording()) {
|
||||||
|
audioService.cancelStreamingRecording('wake-discarded');
|
||||||
|
} else {
|
||||||
|
audioService.cancelRecording();
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
}
|
}
|
||||||
}).catch(() => {});
|
}).catch(() => {});
|
||||||
}
|
}
|
||||||
@@ -1266,64 +1273,75 @@ const ChatScreen: React.FC = () => {
|
|||||||
return () => unsubPlayback();
|
return () => unsubPlayback();
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
// Wake Word / Gespraechsmodus: Auto-Aufnahme starten
|
// Wake Word / Gespraechsmodus: Auto-Aufnahme starten (Streaming-Modus)
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const unsubWake = wakeWordService.onWakeWord(async () => {
|
const unsubWake = wakeWordService.onWakeWord(async () => {
|
||||||
console.log('[Chat] Gespraechsmodus — starte Auto-Aufnahme');
|
console.log('[Chat] Gespraechsmodus — starte Streaming-Aufnahme');
|
||||||
import('../services/logger').then(m => m.reportAppDebug('wake.cb', 'callback fired, calling startRecording')).catch(()=>{});
|
import('../services/logger').then(m => m.reportAppDebug('wake.cb', 'callback fired, calling startStreamingRecording')).catch(()=>{});
|
||||||
// Conversation-Window: User hat X Sekunden um anzufangen, sonst Konversation aus
|
|
||||||
|
// Bubble SOFORT bauen — bevor Whisper-Bridge antwortet — damit der User
|
||||||
|
// sieht "Es passiert was". stt_endpoint kommt typisch <1s spaeter mit
|
||||||
|
// dem finalen Text, dann wird die Bubble ueber audioRequestId-Match
|
||||||
|
// aktualisiert (siehe chat-Handler oben).
|
||||||
|
const audioRequestId = `audio_${Date.now()}_${Math.floor(Math.random() * 100000)}`;
|
||||||
|
const wasInterrupted = interruptAriaIfBusy();
|
||||||
|
const location = await getCurrentLocation();
|
||||||
const windowMs = await loadConvWindowMs();
|
const windowMs = await loadConvWindowMs();
|
||||||
const started = await audioService.startRecording(true, windowMs);
|
|
||||||
import('../services/logger').then(m => m.reportAppDebug('wake.cb', `startRecording returned ${started}`)).catch(()=>{});
|
const userMsg: ChatMessage = {
|
||||||
if (started) {
|
id: nextId(),
|
||||||
// Erst JETZT signalisieren dass das Mikro wirklich offen ist —
|
sender: 'user',
|
||||||
// vorher war's noch in der Init-Phase. So weiss der User exakt
|
text: '🎙 Spracheingabe wird verarbeitet...',
|
||||||
// ab wann er reden kann. "Bereit"-Sound (Ding-Dong) ist optional
|
timestamp: Date.now(),
|
||||||
// ueber Settings → Wake-Word abschaltbar.
|
attachments: [{ type: 'audio', name: 'Sprachaufnahme' }],
|
||||||
|
audioRequestId,
|
||||||
|
};
|
||||||
|
setMessages(prev => capMessages([...prev, userMsg]));
|
||||||
|
|
||||||
|
const { ok } = await audioService.startStreamingRecording({
|
||||||
|
audioRequestId,
|
||||||
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
|
interrupted: wasInterrupted,
|
||||||
|
location: location || null,
|
||||||
|
noSpeechTimeoutMs: windowMs,
|
||||||
|
endpointMs: 1500,
|
||||||
|
hardCapMs: 60000,
|
||||||
|
});
|
||||||
|
import('../services/logger').then(m => m.reportAppDebug('wake.cb', `startStreamingRecording returned ok=${ok}`)).catch(()=>{});
|
||||||
|
if (ok) {
|
||||||
ToastAndroid.show('🎤 Mikro offen — sprich jetzt', ToastAndroid.SHORT);
|
ToastAndroid.show('🎤 Mikro offen — sprich jetzt', ToastAndroid.SHORT);
|
||||||
playWakeReadySound().catch(() => {});
|
playWakeReadySound().catch(() => {});
|
||||||
import('../services/logger').then(m => m.reportAppDebug('wake.cb', 'gong played + recording started')).catch(()=>{});
|
scheduleStaleAudioCleanup(audioRequestId, 60000);
|
||||||
|
import('../services/logger').then(m => m.reportAppDebug('wake.cb', 'gong played + streaming started')).catch(()=>{});
|
||||||
} else {
|
} else {
|
||||||
// Mikrofon nicht verfuegbar, naechsten Versuch
|
// Mikrofon nicht verfuegbar → Bubble wieder weg, naechster Versuch
|
||||||
|
setMessages(prev => prev.filter(m => m.audioRequestId !== audioRequestId));
|
||||||
wakeWordService.resume();
|
wakeWordService.resume();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Auto-Stop Callback: wenn Stille erkannt → Aufnahme senden + Wake Word wieder starten
|
// STT-Endpoint-Callback ersetzt den alten onSilenceDetected.
|
||||||
const unsubSilence = audioService.onSilenceDetected(async () => {
|
// Feuert in 2 Faellen:
|
||||||
const result = await audioService.stopRecording();
|
// - text != '' → Whisper-Bridge hat ML-Endpoint erkannt, Text liegt vor.
|
||||||
if (result && result.durationMs > 500) {
|
// aria-bridge bekommt das gleiche Event und triggert Brain
|
||||||
// User hat im Fenster gesprochen → Sprachnachricht senden
|
// direkt. App muss nix mehr senden.
|
||||||
// Barge-In: laufende ARIA-Aktivitaet abbrechen wenn welche da ist.
|
// - text == '' → cancelStreamingRecording (no-speech / hardcap / error).
|
||||||
const wasInterrupted = interruptAriaIfBusy();
|
// Konversation beenden wie frueher der "kein Speech"-Fall.
|
||||||
const location = await getCurrentLocation();
|
const unsubEndpoint = audioService.onSttEndpoint((ev) => {
|
||||||
const audioRequestId = `audio_${Date.now()}_${Math.floor(Math.random() * 100000)}`;
|
if (ev.text && ev.text.trim()) {
|
||||||
const userMsg: ChatMessage = {
|
console.log('[Chat] STT-Endpoint: %r (reason=%s, %dms, %.1fs Audio)',
|
||||||
id: nextId(),
|
ev.text.slice(0, 80), ev.reason, ev.sttMs, ev.durationS);
|
||||||
sender: 'user',
|
// Brain laeuft via aria-bridge — wir warten auf chat(sender=stt) +
|
||||||
text: '🎙 Spracheingabe wird verarbeitet...',
|
// chat(sender=aria) wie im Legacy-Pfad.
|
||||||
timestamp: Date.now(),
|
|
||||||
attachments: [{ type: 'audio', name: 'Sprachaufnahme' }],
|
|
||||||
audioRequestId,
|
|
||||||
};
|
|
||||||
setMessages(prev => capMessages([...prev, userMsg]));
|
|
||||||
rvs.send('audio', {
|
|
||||||
base64: result.base64,
|
|
||||||
durationMs: result.durationMs,
|
|
||||||
mimeType: result.mimeType,
|
|
||||||
voice: localXttsVoiceRef.current,
|
|
||||||
speed: ttsSpeedRef.current,
|
|
||||||
interrupted: wasInterrupted,
|
|
||||||
audioRequestId,
|
|
||||||
...(location && { location }),
|
|
||||||
});
|
|
||||||
scheduleStaleAudioCleanup(audioRequestId, result.durationMs);
|
|
||||||
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
|
|
||||||
} else {
|
} else {
|
||||||
// Kein Speech im Window → Konversation beenden (Ohr geht aus oder
|
// Kein Speech im Window → Konversation beenden
|
||||||
// bleibt armed wenn Wake Word verfuegbar)
|
console.log('[Chat] STT-Endpoint ohne Text (reason=%s) — endConversation', ev.reason);
|
||||||
|
// Placeholder-Bubble wieder weg
|
||||||
|
if (ev.audioRequestId) {
|
||||||
|
setMessages(prev => prev.filter(m => m.audioRequestId !== ev.audioRequestId));
|
||||||
|
}
|
||||||
wakeWordService.endConversation();
|
wakeWordService.endConversation();
|
||||||
// UI-State synchron halten
|
|
||||||
if (!wakeWordService.isActive()) setWakeWordActive(false);
|
if (!wakeWordService.isActive()) setWakeWordActive(false);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -1332,17 +1350,42 @@ const ChatScreen: React.FC = () => {
|
|||||||
// Wake-Word-Service hat bei TTS-Start parallel zu lauschen begonnen
|
// Wake-Word-Service hat bei TTS-Start parallel zu lauschen begonnen
|
||||||
// (mit AcousticEchoCanceler damit ARIAs eigene Stimme nicht triggert).
|
// (mit AcousticEchoCanceler damit ARIAs eigene Stimme nicht triggert).
|
||||||
const unsubBarge = wakeWordService.onBargeIn(async () => {
|
const unsubBarge = wakeWordService.onBargeIn(async () => {
|
||||||
console.log('[Chat] Barge-In via Wake-Word — TTS abbrechen + neue Aufnahme');
|
console.log('[Chat] Barge-In via Wake-Word — TTS abbrechen + neue Streaming-Aufnahme');
|
||||||
audioService.haltAllPlayback('barge-in via wake-word');
|
audioService.haltAllPlayback('barge-in via wake-word');
|
||||||
setAgentActivity({ activity: 'idle', tool: '' });
|
setAgentActivity({ activity: 'idle', tool: '' });
|
||||||
rvs.send('cancel_request' as any, {});
|
rvs.send('cancel_request' as any, {});
|
||||||
// Kurze Pause damit halt durchgreift, dann neue Aufnahme starten
|
// Kurze Pause damit halt durchgreift, dann neue Aufnahme starten
|
||||||
await new Promise(r => setTimeout(r, 150));
|
await new Promise(r => setTimeout(r, 150));
|
||||||
|
const audioRequestId = `audio_${Date.now()}_${Math.floor(Math.random() * 100000)}`;
|
||||||
|
const location = await getCurrentLocation();
|
||||||
const windowMs = await loadConvWindowMs();
|
const windowMs = await loadConvWindowMs();
|
||||||
const started = await audioService.startRecording(true, windowMs);
|
|
||||||
if (started) {
|
const userMsg: ChatMessage = {
|
||||||
|
id: nextId(),
|
||||||
|
sender: 'user',
|
||||||
|
text: '🎙 Spracheingabe wird verarbeitet...',
|
||||||
|
timestamp: Date.now(),
|
||||||
|
attachments: [{ type: 'audio', name: 'Sprachaufnahme' }],
|
||||||
|
audioRequestId,
|
||||||
|
};
|
||||||
|
setMessages(prev => capMessages([...prev, userMsg]));
|
||||||
|
|
||||||
|
const { ok } = await audioService.startStreamingRecording({
|
||||||
|
audioRequestId,
|
||||||
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
|
interrupted: true, // Barge-In → Brain weiss "User hat unterbrochen"
|
||||||
|
location: location || null,
|
||||||
|
noSpeechTimeoutMs: windowMs,
|
||||||
|
endpointMs: 1500,
|
||||||
|
hardCapMs: 60000,
|
||||||
|
});
|
||||||
|
if (ok) {
|
||||||
ToastAndroid.show('🎤 Mikro offen — sprich jetzt', ToastAndroid.SHORT);
|
ToastAndroid.show('🎤 Mikro offen — sprich jetzt', ToastAndroid.SHORT);
|
||||||
playWakeReadySound().catch(() => {});
|
playWakeReadySound().catch(() => {});
|
||||||
|
scheduleStaleAudioCleanup(audioRequestId, 60000);
|
||||||
|
} else {
|
||||||
|
setMessages(prev => prev.filter(m => m.audioRequestId !== audioRequestId));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1365,7 +1408,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
|
|
||||||
return () => {
|
return () => {
|
||||||
unsubWake();
|
unsubWake();
|
||||||
unsubSilence();
|
unsubEndpoint();
|
||||||
unsubBarge();
|
unsubBarge();
|
||||||
unsubTtsStart();
|
unsubTtsStart();
|
||||||
unsubTtsEnd();
|
unsubTtsEnd();
|
||||||
@@ -1375,11 +1418,18 @@ const ChatScreen: React.FC = () => {
|
|||||||
// Wake Word Toggle Handler
|
// Wake Word Toggle Handler
|
||||||
const toggleWakeWord = useCallback(async () => {
|
const toggleWakeWord = useCallback(async () => {
|
||||||
if (wakeWordActive) {
|
if (wakeWordActive) {
|
||||||
// Vor Porcupine-Stop: eventuelle laufende Aufnahme abbrechen. Sonst
|
// Vor Wake-Word-Stop: eventuelle laufende Aufnahme abbrechen. Sonst
|
||||||
// bleibt audioService.recordingState=='recording' haengen und der
|
// bleibt audioService.recordingState=='recording' haengen und der
|
||||||
// normale Aufnahme-Button wirkt nicht mehr (startRecording lehnt
|
// normale Aufnahme-Button wirkt nicht mehr (startRecording lehnt
|
||||||
// ab weil "Aufnahme laeuft bereits").
|
// ab weil "Aufnahme laeuft bereits"). Beide Pfade abdecken — legacy
|
||||||
try { await audioService.stopRecording(); } catch {}
|
// file-Aufnahme + neue Streaming-Aufnahme.
|
||||||
|
try {
|
||||||
|
if (audioService.isStreamingRecording()) {
|
||||||
|
await audioService.cancelStreamingRecording('wake-toggle-off');
|
||||||
|
} else {
|
||||||
|
await audioService.stopRecording();
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
await wakeWordService.stop();
|
await wakeWordService.stop();
|
||||||
setWakeWordActive(false);
|
setWakeWordActive(false);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ function btoaSafe(bin: string): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Native Module fuer Audio-Focus (Ducking/Muten anderer Apps)
|
// Native Module fuer Audio-Focus (Ducking/Muten anderer Apps)
|
||||||
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
|
const { AudioFocus, PcmStreamPlayer, PcmStreamRecorder } = NativeModules as {
|
||||||
AudioFocus?: {
|
AudioFocus?: {
|
||||||
requestDuck: () => Promise<boolean>;
|
requestDuck: () => Promise<boolean>;
|
||||||
requestExclusive: () => Promise<boolean>;
|
requestExclusive: () => Promise<boolean>;
|
||||||
@@ -51,8 +51,15 @@ const { AudioFocus, PcmStreamPlayer } = NativeModules as {
|
|||||||
end: () => Promise<boolean>;
|
end: () => Promise<boolean>;
|
||||||
stop: () => Promise<boolean>;
|
stop: () => Promise<boolean>;
|
||||||
};
|
};
|
||||||
|
PcmStreamRecorder?: {
|
||||||
|
start: () => Promise<boolean>;
|
||||||
|
stop: () => Promise<boolean>;
|
||||||
|
isRecording: () => Promise<boolean>;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
import rvs from './rvs';
|
||||||
|
|
||||||
// --- Typen ---
|
// --- Typen ---
|
||||||
|
|
||||||
export interface RecordingResult {
|
export interface RecordingResult {
|
||||||
@@ -70,6 +77,19 @@ type RecordingStateCallback = (state: RecordingState) => void;
|
|||||||
type MeterCallback = (db: number) => void;
|
type MeterCallback = (db: number) => void;
|
||||||
type SilenceCallback = () => void;
|
type SilenceCallback = () => void;
|
||||||
|
|
||||||
|
/** Endpoint-Event von der Streaming-Whisper-Bridge — finaler Text +
|
||||||
|
* Echo-Felder. ChatScreen reagiert darauf wie frueher auf
|
||||||
|
* onSilenceDetected, nur dass der Text schon da ist. */
|
||||||
|
export interface SttEndpointEvent {
|
||||||
|
audioRequestId: string;
|
||||||
|
text: string;
|
||||||
|
reason: string; // 'endpoint' | 'stream_end' | 'hardcap'
|
||||||
|
durationS: number;
|
||||||
|
sttMs: number;
|
||||||
|
}
|
||||||
|
type SttEndpointCallback = (e: SttEndpointEvent) => void;
|
||||||
|
type SttPartialCallback = (text: string) => void;
|
||||||
|
|
||||||
// --- Konstanten ---
|
// --- Konstanten ---
|
||||||
|
|
||||||
const AUDIO_SAMPLE_RATE = 16000;
|
const AUDIO_SAMPLE_RATE = 16000;
|
||||||
@@ -286,6 +306,26 @@ class AudioService {
|
|||||||
// Position-Berechnen vom playbackStarted abziehen
|
// Position-Berechnen vom playbackStarted abziehen
|
||||||
private readonly LEADING_SILENCE_SEC = 0.3;
|
private readonly LEADING_SILENCE_SEC = 0.3;
|
||||||
|
|
||||||
|
// ── Streaming-STT-Session-State ──
|
||||||
|
// Aktuelle Session-ID (requestId der whisper-bridge). Leer wenn kein Stream
|
||||||
|
// aktiv. Wird beim Eintreffen von Chunks geprueft damit wir nicht versehent-
|
||||||
|
// lich Chunks einer alten Session in eine neue mischen.
|
||||||
|
private streamRequestId: string = '';
|
||||||
|
private streamAudioRequestId: string = '';
|
||||||
|
// Subscriber-Handles fuer Native-Events + RVS-Listener (cleanup beim stop)
|
||||||
|
private streamPcmChunkSub: { remove: () => void } | null = null;
|
||||||
|
private streamPcmErrorSub: { remove: () => void } | null = null;
|
||||||
|
private streamRvsUnsub: (() => void) | null = null;
|
||||||
|
// No-speech-Watchdog: wenn nach N ms noch kein einziger stt_partial kam,
|
||||||
|
// brechen wir die Session ab (Stille → User hat nix gesagt → Konversation
|
||||||
|
// beenden). Ersetzt den alten vad noSpeechTimer.
|
||||||
|
private streamNoSpeechTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
private streamGotPartial: boolean = false;
|
||||||
|
private streamHardCapTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
// Endpoint/Partial-Callbacks fuer ChatScreen
|
||||||
|
private endpointListeners: SttEndpointCallback[] = [];
|
||||||
|
private partialListeners: SttPartialCallback[] = [];
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
this.recorder = new AudioRecorderPlayer();
|
this.recorder = new AudioRecorderPlayer();
|
||||||
this.recorder.setSubscriptionDuration(0.1); // 100ms Metering-Updates
|
this.recorder.setSubscriptionDuration(0.1); // 100ms Metering-Updates
|
||||||
@@ -310,6 +350,60 @@ class AudioService {
|
|||||||
// bleibt liegen. 5min-Threshold damit gerade aktiv geschriebene Files sicher
|
// bleibt liegen. 5min-Threshold damit gerade aktiv geschriebene Files sicher
|
||||||
// sind. cleanupOnStartup ist async, blockt den Constructor nicht.
|
// sind. cleanupOnStartup ist async, blockt den Constructor nicht.
|
||||||
this._cleanupStaleCacheFiles(5 * 60 * 1000).catch(() => {});
|
this._cleanupStaleCacheFiles(5 * 60 * 1000).catch(() => {});
|
||||||
|
|
||||||
|
// RVS-Listener fuer Streaming-STT-Antworten der Whisper-Bridge.
|
||||||
|
// Wir subscribed permanent — gefiltert wird ueber streamRequestId-Match.
|
||||||
|
// Das macht startStreamingRecording einfacher (kein subscribe/unsubscribe
|
||||||
|
// pro Session noetig).
|
||||||
|
try {
|
||||||
|
this.streamRvsUnsub = rvs.onMessage((msg) => {
|
||||||
|
const t = msg?.type;
|
||||||
|
if (t !== 'stt_partial' && t !== 'stt_endpoint' && t !== 'stt_stream_done') return;
|
||||||
|
const p = (msg as any).payload || {};
|
||||||
|
const reqId = String(p.requestId || '');
|
||||||
|
if (!reqId || reqId !== this.streamRequestId) return;
|
||||||
|
if (t === 'stt_partial') {
|
||||||
|
const text = String(p.text || '');
|
||||||
|
this.streamGotPartial = true;
|
||||||
|
// Sobald wir ueberhaupt mal Text gekriegt haben, ist der no-speech
|
||||||
|
// Watchdog erledigt.
|
||||||
|
if (this.streamNoSpeechTimer) {
|
||||||
|
clearTimeout(this.streamNoSpeechTimer);
|
||||||
|
this.streamNoSpeechTimer = null;
|
||||||
|
}
|
||||||
|
this.partialListeners.forEach(cb => {
|
||||||
|
try { cb(text); } catch (e) { console.warn('[Audio] partial listener err:', e); }
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (t === 'stt_endpoint') {
|
||||||
|
const ev: SttEndpointEvent = {
|
||||||
|
audioRequestId: String(p.audioRequestId || ''),
|
||||||
|
text: String(p.text || ''),
|
||||||
|
reason: String(p.reason || ''),
|
||||||
|
durationS: Number(p.durationS || 0),
|
||||||
|
sttMs: Number(p.sttMs || 0),
|
||||||
|
};
|
||||||
|
console.log('[Audio] stt_endpoint: %dms, %.1fs Audio, text=%r',
|
||||||
|
ev.sttMs, ev.durationS, ev.text.slice(0, 80));
|
||||||
|
// Wir stoppen die Aufnahme — whisper hat alles was es braucht.
|
||||||
|
// Kein stt_stream_end senden: das Endpoint kam von der Bridge,
|
||||||
|
// sie hat schon finalisiert.
|
||||||
|
this._cleanupStreamLocal('endpoint');
|
||||||
|
this.endpointListeners.forEach(cb => {
|
||||||
|
try { cb(ev); } catch (e) { console.warn('[Audio] endpoint listener err:', e); }
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (t === 'stt_stream_done') {
|
||||||
|
// Idempotent — falls cleanup nach endpoint schon lief, harmlos.
|
||||||
|
this._cleanupStreamLocal('stream_done');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[Audio] RVS-Listener-Subscribe fehlgeschlagen:', err);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** AudioFocus mit kleiner Verzoegerung freigeben — Spotify/YouTube
|
/** AudioFocus mit kleiner Verzoegerung freigeben — Spotify/YouTube
|
||||||
@@ -822,6 +916,251 @@ class AudioService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────
|
||||||
|
// STREAMING-AUFNAHME (Phase 1+2 — PCM live an Whisper-Bridge)
|
||||||
|
// ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Startet eine Streaming-STT-Session.
|
||||||
|
*
|
||||||
|
* Statt eine MP4-Datei aufzunehmen und am Ende hochzuladen, oeffnet der
|
||||||
|
* PcmStreamRecorder (16 kHz mono s16le) ein AudioRecord und schickt
|
||||||
|
* alle 200ms einen PCM-Chunk via rvs.send('stt_audio_chunk') an die
|
||||||
|
* whisper-bridge. Diese transkribiert live und feuert stt_endpoint
|
||||||
|
* sobald der erkannte Text fuer endpointMs nicht mehr waechst.
|
||||||
|
*
|
||||||
|
* Auf stt_endpoint reagiert audio.ts indem es PcmStreamRecorder stoppt
|
||||||
|
* und endpointListeners feuert — ChatScreen baut dann die Chat-Bubble.
|
||||||
|
* Den eigentlichen Brain-Call macht aria-bridge direkt nach stt_endpoint,
|
||||||
|
* KEIN Audio-Roundtrip ueber die App noetig.
|
||||||
|
*
|
||||||
|
* Args:
|
||||||
|
* audioRequestId — eindeutige Korrelations-ID fuer die "wird
|
||||||
|
* verarbeitet"-Bubble (gleiche Semantik wie beim
|
||||||
|
* Legacy-Pfad mit rvs.send('audio')).
|
||||||
|
* voice/speed — TTS-Echo-Felder, werden an Brain weitergegeben.
|
||||||
|
* interrupted — true bei Barge-In waehrend ARIA noch sprach.
|
||||||
|
* location — GPS, falls vorhanden.
|
||||||
|
* noSpeechTimeoutMs — wenn nach so vielen ms KEIN stt_partial kam
|
||||||
|
* (= Whisper hat nix erkannt), brechen wir die
|
||||||
|
* Session ab. 0 = kein Watchdog.
|
||||||
|
* endpointMs — Schwellwert fuer Endpoint (Stille = kein neuer
|
||||||
|
* Text). Default 1500ms — Whisper-Bridge nutzt
|
||||||
|
* den Wert wenn mitgesendet.
|
||||||
|
* hardCapMs — Schmerzgrenze. Default 60s.
|
||||||
|
*/
|
||||||
|
async startStreamingRecording(opts: {
|
||||||
|
audioRequestId: string;
|
||||||
|
voice?: string;
|
||||||
|
speed?: number;
|
||||||
|
interrupted?: boolean;
|
||||||
|
location?: any;
|
||||||
|
noSpeechTimeoutMs?: number;
|
||||||
|
endpointMs?: number;
|
||||||
|
hardCapMs?: number;
|
||||||
|
}): Promise<{ requestId: string; ok: boolean }> {
|
||||||
|
if (this.recordingState !== 'idle') {
|
||||||
|
console.warn('[Audio] startStreamingRecording: bereits aktiv (state=%s)', this.recordingState);
|
||||||
|
return { requestId: '', ok: false };
|
||||||
|
}
|
||||||
|
if (!PcmStreamRecorder) {
|
||||||
|
console.warn('[Audio] PcmStreamRecorder Native-Modul nicht verfuegbar');
|
||||||
|
return { requestId: '', ok: false };
|
||||||
|
}
|
||||||
|
const hasPermission = await this.requestMicrophonePermission();
|
||||||
|
if (!hasPermission) {
|
||||||
|
console.warn('[Audio] Keine Mikrofon-Berechtigung');
|
||||||
|
return { requestId: '', ok: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Laufende Wiedergabe stoppen (damit ARIA sich nicht selbst hoert)
|
||||||
|
this.stopPlayback();
|
||||||
|
|
||||||
|
const requestId = `sttstr_${Date.now()}_${Math.floor(Math.random() * 100000)}`;
|
||||||
|
this.streamRequestId = requestId;
|
||||||
|
this.streamAudioRequestId = opts.audioRequestId || '';
|
||||||
|
this.streamGotPartial = false;
|
||||||
|
this.recordingStartTime = Date.now();
|
||||||
|
|
||||||
|
try {
|
||||||
|
await acquireBackgroundAudio('rec');
|
||||||
|
|
||||||
|
// PcmStreamChunk-Subscriber AUFSETZEN BEVOR der Recorder startet —
|
||||||
|
// sonst koennten die ersten 1-2 Chunks ins Leere gehen.
|
||||||
|
try {
|
||||||
|
const emitter = new NativeEventEmitter(NativeModules.PcmStreamRecorder as any);
|
||||||
|
this.streamPcmChunkSub = emitter.addListener('PcmStreamChunk', (e: any) => {
|
||||||
|
// Nur Chunks der aktuellen Session weiterleiten — verhindert dass
|
||||||
|
// ein verspaeteter Chunk in einer neuen Session landet.
|
||||||
|
if (!this.streamRequestId) return;
|
||||||
|
const sessionId = this.streamRequestId;
|
||||||
|
rvs.send('stt_audio_chunk' as any, {
|
||||||
|
requestId: sessionId,
|
||||||
|
pcm: String(e?.pcm || ''),
|
||||||
|
seq: Number(e?.seq || 0),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
this.streamPcmErrorSub = emitter.addListener('PcmStreamError', (e: any) => {
|
||||||
|
console.warn('[Audio] PcmStreamRecorder-Fehler:', e?.error);
|
||||||
|
this._cleanupStreamLocal('pcm-error');
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[Audio] PcmStreamChunk-Subscription fehlgeschlagen:', err);
|
||||||
|
}
|
||||||
|
|
||||||
|
const started = await PcmStreamRecorder.start();
|
||||||
|
if (!started) {
|
||||||
|
throw new Error('PcmStreamRecorder.start returned false');
|
||||||
|
}
|
||||||
|
|
||||||
|
// AudioFocus exklusiv — gleiche Semantik wie beim Legacy-Pfad.
|
||||||
|
this._cancelDeferredFocusRelease();
|
||||||
|
AudioFocus?.requestExclusive().catch(() => {});
|
||||||
|
|
||||||
|
this.setState('recording');
|
||||||
|
|
||||||
|
// stt_stream_start — der Whisper-Bridge mitteilen dass jetzt Chunks kommen.
|
||||||
|
rvs.send('stt_stream_start' as any, {
|
||||||
|
requestId,
|
||||||
|
audioRequestId: opts.audioRequestId || '',
|
||||||
|
voice: opts.voice || '',
|
||||||
|
speed: typeof opts.speed === 'number' ? opts.speed : 1.0,
|
||||||
|
interrupted: !!opts.interrupted,
|
||||||
|
location: opts.location || null,
|
||||||
|
endpointMs: typeof opts.endpointMs === 'number' ? opts.endpointMs : 1500,
|
||||||
|
hardCapMs: typeof opts.hardCapMs === 'number' ? opts.hardCapMs : 60000,
|
||||||
|
sampleRate: 16000,
|
||||||
|
});
|
||||||
|
|
||||||
|
// No-Speech-Watchdog — ersetzt den alten VAD-noSpeechTimer.
|
||||||
|
// Wenn nach Konversationsfenster kein einziger stt_partial gekommen ist,
|
||||||
|
// hat der User vermutlich nix gesagt → Session beenden.
|
||||||
|
const noSpeechMs = Number(opts.noSpeechTimeoutMs || 0);
|
||||||
|
if (noSpeechMs > 0) {
|
||||||
|
this.streamNoSpeechTimer = setTimeout(() => {
|
||||||
|
if (this.streamRequestId === requestId && !this.streamGotPartial) {
|
||||||
|
console.log('[Audio] Stream %s: no-speech nach %dms → cancel',
|
||||||
|
requestId.slice(0, 12), noSpeechMs);
|
||||||
|
this.cancelStreamingRecording('no-speech').catch(() => {});
|
||||||
|
}
|
||||||
|
}, noSpeechMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hard-Cap als zweite Sicherheitsleine (App-seitig zusaetzlich zur Bridge).
|
||||||
|
const hardCapMs = Number(opts.hardCapMs || 60000);
|
||||||
|
this.streamHardCapTimer = setTimeout(() => {
|
||||||
|
if (this.streamRequestId === requestId) {
|
||||||
|
console.log('[Audio] Stream %s: app-side hardcap %dms erreicht → end',
|
||||||
|
requestId.slice(0, 12), hardCapMs);
|
||||||
|
this.stopStreamingRecording('hardcap').catch(() => {});
|
||||||
|
}
|
||||||
|
}, hardCapMs + 2000); // +2s damit Bridge zuerst feuert wenn moeglich
|
||||||
|
|
||||||
|
console.log('[Audio] Streaming-Aufnahme gestartet (requestId=%s, audioRequestId=%s)',
|
||||||
|
requestId.slice(0, 12), (opts.audioRequestId || '').slice(0, 16));
|
||||||
|
return { requestId, ok: true };
|
||||||
|
} catch (err) {
|
||||||
|
console.error('[Audio] startStreamingRecording fehlgeschlagen:', err);
|
||||||
|
this._cleanupStreamLocal('start-failed');
|
||||||
|
return { requestId: '', ok: false };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sauberer User-initiated Stop. Sendet stt_stream_end an die Bridge,
|
||||||
|
* die noch ihren Final-Transcribe macht. */
|
||||||
|
async stopStreamingRecording(reason: string = 'user'): Promise<void> {
|
||||||
|
const reqId = this.streamRequestId;
|
||||||
|
if (!reqId) return;
|
||||||
|
try {
|
||||||
|
rvs.send('stt_stream_end' as any, { requestId: reqId, reason });
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('[Audio] stt_stream_end senden fehlgeschlagen:', e);
|
||||||
|
}
|
||||||
|
// Recorder lokal abschalten — Bridge feuert dann ihrerseits noch
|
||||||
|
// stt_endpoint + stt_stream_done.
|
||||||
|
this._cleanupStreamLocal(`stop:${reason}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Abbruch ohne dass Brain den Text verarbeitet — z.B. wenn der User
|
||||||
|
* im Conversation-Window nichts sagt oder cancel drueckt.
|
||||||
|
*
|
||||||
|
* Feuert endpointListeners mit text='' damit ChatScreen den Fall genauso
|
||||||
|
* behandeln kann wie frueher onSilenceDetected→stopRecording()→null:
|
||||||
|
* Konversation beenden, Ohr zurueck auf armed. */
|
||||||
|
async cancelStreamingRecording(reason: string = 'cancel'): Promise<void> {
|
||||||
|
const reqId = this.streamRequestId;
|
||||||
|
if (!reqId) return;
|
||||||
|
const audioReqId = this.streamAudioRequestId;
|
||||||
|
try {
|
||||||
|
rvs.send('stt_stream_end' as any, { requestId: reqId, reason: `cancel:${reason}` });
|
||||||
|
} catch {}
|
||||||
|
this._cleanupStreamLocal(`cancel:${reason}`);
|
||||||
|
// Listener feuern damit ChatScreen reagieren kann (endConversation etc.)
|
||||||
|
const ev: SttEndpointEvent = {
|
||||||
|
audioRequestId: audioReqId,
|
||||||
|
text: '',
|
||||||
|
reason: `cancel:${reason}`,
|
||||||
|
durationS: 0,
|
||||||
|
sttMs: 0,
|
||||||
|
};
|
||||||
|
this.endpointListeners.forEach(cb => {
|
||||||
|
try { cb(ev); } catch (e) { console.warn('[Audio] endpoint listener (cancel) err:', e); }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Nur-lokale Cleanup: PcmStreamRecorder stoppen, Listener entfernen,
|
||||||
|
* AudioFocus freigeben, State zurueck auf idle. Nicht ueber RVS
|
||||||
|
* kommunizieren — Caller hat das schon erledigt (oder eben nicht
|
||||||
|
* noetig wenn Bridge das Endpoint gefeuert hat). */
|
||||||
|
private _cleanupStreamLocal(reason: string): void {
|
||||||
|
if (!this.streamRequestId) return;
|
||||||
|
console.log('[Audio] Stream cleanup (%s)', reason);
|
||||||
|
this.streamRequestId = '';
|
||||||
|
this.streamAudioRequestId = '';
|
||||||
|
this.streamGotPartial = false;
|
||||||
|
if (this.streamNoSpeechTimer) {
|
||||||
|
clearTimeout(this.streamNoSpeechTimer);
|
||||||
|
this.streamNoSpeechTimer = null;
|
||||||
|
}
|
||||||
|
if (this.streamHardCapTimer) {
|
||||||
|
clearTimeout(this.streamHardCapTimer);
|
||||||
|
this.streamHardCapTimer = null;
|
||||||
|
}
|
||||||
|
if (this.streamPcmChunkSub) {
|
||||||
|
try { this.streamPcmChunkSub.remove(); } catch {}
|
||||||
|
this.streamPcmChunkSub = null;
|
||||||
|
}
|
||||||
|
if (this.streamPcmErrorSub) {
|
||||||
|
try { this.streamPcmErrorSub.remove(); } catch {}
|
||||||
|
this.streamPcmErrorSub = null;
|
||||||
|
}
|
||||||
|
PcmStreamRecorder?.stop().catch(() => {});
|
||||||
|
this._releaseFocusDeferred();
|
||||||
|
this.setState('idle');
|
||||||
|
}
|
||||||
|
|
||||||
|
/** True wenn aktuell eine Streaming-Session laeuft. */
|
||||||
|
isStreamingRecording(): boolean {
|
||||||
|
return !!this.streamRequestId;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Subscribe auf stt_endpoint — feuert wenn die Whisper-Bridge erkannt
|
||||||
|
* hat, dass der User fertig gesprochen hat (ML-Endpointer). */
|
||||||
|
onSttEndpoint(callback: SttEndpointCallback): () => void {
|
||||||
|
this.endpointListeners.push(callback);
|
||||||
|
return () => {
|
||||||
|
this.endpointListeners = this.endpointListeners.filter(cb => cb !== callback);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Subscribe auf stt_partial — Live-Transkript-Updates (optional fuer
|
||||||
|
* UI-Feedback in der Voice-Bubble). */
|
||||||
|
onSttPartial(callback: SttPartialCallback): () => void {
|
||||||
|
this.partialListeners.push(callback);
|
||||||
|
return () => {
|
||||||
|
this.partialListeners = this.partialListeners.filter(cb => cb !== callback);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// --- Wiedergabe ---
|
// --- Wiedergabe ---
|
||||||
|
|
||||||
/** Base64-kodiertes Audio in die Queue stellen und abspielen */
|
/** Base64-kodiertes Audio in die Queue stellen und abspielen */
|
||||||
|
|||||||
Reference in New Issue
Block a user