feat: Max-Aufnahmedauer konfigurierbar + Barge-In gibt aria-core Kontext
Max-Aufnahme: Default von 2 auf 5 Minuten angehoben, in den App-Settings zwischen 1 und 30 Minuten konfigurierbar (loadMaxRecordingMs liest aus AsyncStorage, Storage-Key aria_max_recording_sec). Das Notbremse-Verhalten bleibt: Nach Ablauf wird die Aufnahme automatisch beendet und gesendet. Barge-In-Kontext: Wenn der User, waehrend ARIA noch redet/arbeitet, eine neue Sprach- oder Text-Nachricht sendet, geht jetzt ein 'interrupted: true'-Flag mit. Die Bridge stellt dem Text fuer aria-core dann folgenden Hinweis voran: "[Hinweis: Stefan hat dich gerade unterbrochen waehrend du noch gesprochen oder gearbeitet hast. Folgendes ist eine Korrektur, Ergaenzung oder ein Themenwechsel zu deiner letzten Antwort.]" So weiss ARIA, dass die neue Message KEINE eigenstaendige Folgefrage ist, sondern sich auf den abgebrochenen Run bezieht. Der User sieht in seinem Chat nur den reinen Text — der Hinweis geht nur an aria-core. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -505,7 +505,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
if (result && result.durationMs > 500) {
|
if (result && result.durationMs > 500) {
|
||||||
// User hat im Fenster gesprochen → Sprachnachricht senden
|
// User hat im Fenster gesprochen → Sprachnachricht senden
|
||||||
// Barge-In: laufende ARIA-Aktivitaet abbrechen wenn welche da ist.
|
// Barge-In: laufende ARIA-Aktivitaet abbrechen wenn welche da ist.
|
||||||
interruptAriaIfBusy();
|
const wasInterrupted = interruptAriaIfBusy();
|
||||||
const location = await getCurrentLocation();
|
const location = await getCurrentLocation();
|
||||||
const userMsg: ChatMessage = {
|
const userMsg: ChatMessage = {
|
||||||
id: nextId(),
|
id: nextId(),
|
||||||
@@ -521,6 +521,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
mimeType: result.mimeType,
|
mimeType: result.mimeType,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
speed: ttsSpeedRef.current,
|
speed: ttsSpeedRef.current,
|
||||||
|
interrupted: wasInterrupted,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
|
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
|
||||||
@@ -623,6 +624,8 @@ const ChatScreen: React.FC = () => {
|
|||||||
|
|
||||||
setInputText('');
|
setInputText('');
|
||||||
|
|
||||||
|
// Barge-In: laufende ARIA-Aktivitaet abbrechen wenn welche da ist.
|
||||||
|
const wasInterrupted = interruptAriaIfBusy();
|
||||||
const location = await getCurrentLocation();
|
const location = await getCurrentLocation();
|
||||||
|
|
||||||
const userMsg: ChatMessage = {
|
const userMsg: ChatMessage = {
|
||||||
@@ -633,16 +636,17 @@ const ChatScreen: React.FC = () => {
|
|||||||
};
|
};
|
||||||
setMessages(prev => capMessages([...prev, userMsg]));
|
setMessages(prev => capMessages([...prev, userMsg]));
|
||||||
|
|
||||||
console.log('[Chat] sende mit voice=%s speed=%s',
|
console.log('[Chat] sende mit voice=%s speed=%s interrupted=%s',
|
||||||
localXttsVoiceRef.current || '(default)', ttsSpeedRef.current);
|
localXttsVoiceRef.current || '(default)', ttsSpeedRef.current, wasInterrupted);
|
||||||
// An RVS senden — mit geraetelokaler Voice (Bridge nutzt sie fuer die Antwort)
|
// An RVS senden — mit geraetelokaler Voice (Bridge nutzt sie fuer die Antwort)
|
||||||
rvs.send('chat', {
|
rvs.send('chat', {
|
||||||
text,
|
text,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
speed: ttsSpeedRef.current,
|
speed: ttsSpeedRef.current,
|
||||||
|
interrupted: wasInterrupted,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]);
|
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments, interruptAriaIfBusy]);
|
||||||
|
|
||||||
// Anfrage abbrechen — sofort lokalen Indicator weg, Bridge triggert doctor --fix
|
// Anfrage abbrechen — sofort lokalen Indicator weg, Bridge triggert doctor --fix
|
||||||
const cancelRequest = useCallback(() => {
|
const cancelRequest = useCallback(() => {
|
||||||
@@ -671,7 +675,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
// Sprachaufnahme abgeschlossen
|
// Sprachaufnahme abgeschlossen
|
||||||
const handleVoiceRecording = useCallback(async (result: RecordingResult) => {
|
const handleVoiceRecording = useCallback(async (result: RecordingResult) => {
|
||||||
// Barge-In: laufende ARIA-Aktivitaet abbrechen falls aktiv.
|
// Barge-In: laufende ARIA-Aktivitaet abbrechen falls aktiv.
|
||||||
interruptAriaIfBusy();
|
const wasInterrupted = interruptAriaIfBusy();
|
||||||
const location = await getCurrentLocation();
|
const location = await getCurrentLocation();
|
||||||
|
|
||||||
const userMsg: ChatMessage = {
|
const userMsg: ChatMessage = {
|
||||||
@@ -688,6 +692,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
mimeType: result.mimeType,
|
mimeType: result.mimeType,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
speed: ttsSpeedRef.current,
|
speed: ttsSpeedRef.current,
|
||||||
|
interrupted: wasInterrupted,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
}, [getCurrentLocation, interruptAriaIfBusy]);
|
}, [getCurrentLocation, interruptAriaIfBusy]);
|
||||||
|
|||||||
@@ -35,6 +35,10 @@ import {
|
|||||||
CONV_WINDOW_MIN_SEC,
|
CONV_WINDOW_MIN_SEC,
|
||||||
CONV_WINDOW_MAX_SEC,
|
CONV_WINDOW_MAX_SEC,
|
||||||
CONV_WINDOW_STORAGE_KEY,
|
CONV_WINDOW_STORAGE_KEY,
|
||||||
|
MAX_RECORDING_DEFAULT_SEC,
|
||||||
|
MAX_RECORDING_MIN_SEC,
|
||||||
|
MAX_RECORDING_MAX_SEC,
|
||||||
|
MAX_RECORDING_STORAGE_KEY,
|
||||||
TTS_SPEED_DEFAULT,
|
TTS_SPEED_DEFAULT,
|
||||||
TTS_SPEED_MIN,
|
TTS_SPEED_MIN,
|
||||||
TTS_SPEED_MAX,
|
TTS_SPEED_MAX,
|
||||||
@@ -102,6 +106,7 @@ const SettingsScreen: React.FC = () => {
|
|||||||
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
||||||
const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC);
|
const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC);
|
||||||
const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC);
|
const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC);
|
||||||
|
const [maxRecordingSec, setMaxRecordingSec] = useState<number>(MAX_RECORDING_DEFAULT_SEC);
|
||||||
const [ttsSpeed, setTtsSpeed] = useState<number>(TTS_SPEED_DEFAULT);
|
const [ttsSpeed, setTtsSpeed] = useState<number>(TTS_SPEED_DEFAULT);
|
||||||
const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD);
|
const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD);
|
||||||
const [wakeStatus, setWakeStatus] = useState<string>('');
|
const [wakeStatus, setWakeStatus] = useState<string>('');
|
||||||
@@ -156,6 +161,14 @@ const SettingsScreen: React.FC = () => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
AsyncStorage.getItem(MAX_RECORDING_STORAGE_KEY).then(saved => {
|
||||||
|
if (saved != null) {
|
||||||
|
const n = parseFloat(saved);
|
||||||
|
if (isFinite(n) && n >= MAX_RECORDING_MIN_SEC && n <= MAX_RECORDING_MAX_SEC) {
|
||||||
|
setMaxRecordingSec(n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY).then(saved => {
|
AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY).then(saved => {
|
||||||
if (saved != null) {
|
if (saved != null) {
|
||||||
const n = parseFloat(saved);
|
const n = parseFloat(saved);
|
||||||
@@ -671,6 +684,38 @@ const SettingsScreen: React.FC = () => {
|
|||||||
<Text style={styles.prerollButtonText}>+1</Text>
|
<Text style={styles.prerollButtonText}>+1</Text>
|
||||||
</TouchableOpacity>
|
</TouchableOpacity>
|
||||||
</View>
|
</View>
|
||||||
|
|
||||||
|
<Text style={[styles.toggleLabel, {marginTop: 24}]}>Maximale Aufnahmedauer</Text>
|
||||||
|
<Text style={styles.toggleHint}>
|
||||||
|
Notbremse: nach so vielen Minuten wird die Aufnahme automatisch beendet,
|
||||||
|
auch wenn keine Stille erkannt wurde. Nuetzlich fuer lange Erklaerungen
|
||||||
|
oder Diktate. Default: {Math.round(MAX_RECORDING_DEFAULT_SEC / 60)} Min, max {Math.round(MAX_RECORDING_MAX_SEC / 60)} Min.
|
||||||
|
</Text>
|
||||||
|
<View style={styles.prerollRow}>
|
||||||
|
<TouchableOpacity
|
||||||
|
style={styles.prerollButton}
|
||||||
|
onPress={() => {
|
||||||
|
const next = Math.max(MAX_RECORDING_MIN_SEC, maxRecordingSec - 60);
|
||||||
|
setMaxRecordingSec(next);
|
||||||
|
AsyncStorage.setItem(MAX_RECORDING_STORAGE_KEY, String(next));
|
||||||
|
}}
|
||||||
|
disabled={maxRecordingSec <= MAX_RECORDING_MIN_SEC}
|
||||||
|
>
|
||||||
|
<Text style={styles.prerollButtonText}>−1m</Text>
|
||||||
|
</TouchableOpacity>
|
||||||
|
<Text style={styles.prerollValue}>{Math.round(maxRecordingSec / 60)} min</Text>
|
||||||
|
<TouchableOpacity
|
||||||
|
style={styles.prerollButton}
|
||||||
|
onPress={() => {
|
||||||
|
const next = Math.min(MAX_RECORDING_MAX_SEC, maxRecordingSec + 60);
|
||||||
|
setMaxRecordingSec(next);
|
||||||
|
AsyncStorage.setItem(MAX_RECORDING_STORAGE_KEY, String(next));
|
||||||
|
}}
|
||||||
|
disabled={maxRecordingSec >= MAX_RECORDING_MAX_SEC}
|
||||||
|
>
|
||||||
|
<Text style={styles.prerollButtonText}>+1m</Text>
|
||||||
|
</TouchableOpacity>
|
||||||
|
</View>
|
||||||
</View>
|
</View>
|
||||||
|
|
||||||
{/* === Wake-Word (komplett on-device, openWakeWord) === */}
|
{/* === Wake-Word (komplett on-device, openWakeWord) === */}
|
||||||
|
|||||||
@@ -145,7 +145,24 @@ async function loadVadSilenceMs(): Promise<number> {
|
|||||||
|
|
||||||
// Max-Dauer einer Aufnahme (Notbremse gegen Runaway-Loops). Auf 2 Minuten
|
// Max-Dauer einer Aufnahme (Notbremse gegen Runaway-Loops). Frueher fest
|
||||||
// hochgezogen damit auch laengere Erklaerungen durchgehen.
|
// 2 Minuten, jetzt laenger, damit auch laengere Erklaerungen durchgehen.
|
||||||
const MAX_RECORDING_MS = 120000;
|
// Default 5 Minuten — konfigurierbar in den App-Settings (1-30 Minuten).
|
||||||
|
export const MAX_RECORDING_DEFAULT_SEC = 300;
|
||||||
|
export const MAX_RECORDING_MIN_SEC = 60;
|
||||||
|
export const MAX_RECORDING_MAX_SEC = 1800;
|
||||||
|
export const MAX_RECORDING_STORAGE_KEY = 'aria_max_recording_sec';
|
||||||
|
|
||||||
|
export async function loadMaxRecordingMs(): Promise<number> {
|
||||||
|
try {
|
||||||
|
const raw = await AsyncStorage.getItem(MAX_RECORDING_STORAGE_KEY);
|
||||||
|
if (raw != null) {
|
||||||
|
const n = parseFloat(raw);
|
||||||
|
if (isFinite(n) && n >= MAX_RECORDING_MIN_SEC && n <= MAX_RECORDING_MAX_SEC) {
|
||||||
|
return Math.round(n * 1000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
|
return MAX_RECORDING_DEFAULT_SEC * 1000;
|
||||||
|
}
|
||||||
|
|
||||||
// Pre-Roll: Wie lange Audio im AudioTrack-Buffer liegt bevor play() startet.
|
// Pre-Roll: Wie lange Audio im AudioTrack-Buffer liegt bevor play() startet.
|
||||||
// Einstellbar via Diagnostic/Settings (Key: aria_tts_preroll_sec).
|
// Einstellbar via Diagnostic/Settings (Key: aria_tts_preroll_sec).
|
||||||
@@ -440,18 +457,19 @@ class AudioService {
|
|||||||
};
|
};
|
||||||
if (autoStop) {
|
if (autoStop) {
|
||||||
const vadSilenceMs = await loadVadSilenceMs();
|
const vadSilenceMs = await loadVadSilenceMs();
|
||||||
|
const maxRecordingMs = await loadMaxRecordingMs();
|
||||||
console.log('[Audio] startRecording: autoStop=true, VAD-Stille=%dms, MAX=%dms',
|
console.log('[Audio] startRecording: autoStop=true, VAD-Stille=%dms, MAX=%dms',
|
||||||
vadSilenceMs, MAX_RECORDING_MS);
|
vadSilenceMs, maxRecordingMs);
|
||||||
this.vadTimer = setInterval(() => {
|
this.vadTimer = setInterval(() => {
|
||||||
const silenceDuration = Date.now() - this.lastSpeechTime;
|
const silenceDuration = Date.now() - this.lastSpeechTime;
|
||||||
if (silenceDuration >= vadSilenceMs) {
|
if (silenceDuration >= vadSilenceMs) {
|
||||||
fireSilenceOnce(`VAD ${silenceDuration}ms Stille (Schwelle=${vadSilenceMs}ms)`);
|
fireSilenceOnce(`VAD ${silenceDuration}ms Stille (Schwelle=${vadSilenceMs}ms)`);
|
||||||
}
|
}
|
||||||
}, 200);
|
}, 200);
|
||||||
// Notbremse: Nach MAX_RECORDING_MS zwangsweise stoppen
|
// Notbremse: Nach maxRecordingMs zwangsweise stoppen
|
||||||
this.maxDurationTimer = setTimeout(() => {
|
this.maxDurationTimer = setTimeout(() => {
|
||||||
fireSilenceOnce(`Max-Dauer ${MAX_RECORDING_MS}ms`);
|
fireSilenceOnce(`Max-Dauer ${maxRecordingMs}ms`);
|
||||||
}, MAX_RECORDING_MS);
|
}, maxRecordingMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Conversation-Window: Wenn der User innerhalb noSpeechTimeoutMs nicht
|
// Conversation-Window: Wenn der User innerhalb noSpeechTimeoutMs nicht
|
||||||
|
|||||||
+33
-8
@@ -1235,6 +1235,7 @@ class ARIABridge:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
self._next_speed_override = None
|
self._next_speed_override = None
|
||||||
if text:
|
if text:
|
||||||
|
interrupted = bool(payload.get("interrupted", False))
|
||||||
# Wenn Files gerade gepuffert sind (Bild + Text gleichzeitig
|
# Wenn Files gerade gepuffert sind (Bild + Text gleichzeitig
|
||||||
# gesendet), mergen wir sie zu einer einzigen Anfrage statt
|
# gesendet), mergen wir sie zu einer einzigen Anfrage statt
|
||||||
# zwei separater send_to_core-Calls.
|
# zwei separater send_to_core-Calls.
|
||||||
@@ -1242,8 +1243,16 @@ class ARIABridge:
|
|||||||
if merged:
|
if merged:
|
||||||
logger.info("[rvs] App-Chat (mit Anhaengen): '%s'", text[:80])
|
logger.info("[rvs] App-Chat (mit Anhaengen): '%s'", text[:80])
|
||||||
else:
|
else:
|
||||||
logger.info("[rvs] App-Chat: '%s'", text[:80])
|
core_text = (
|
||||||
await self.send_to_core(text, source="app")
|
f"[Hinweis: Stefan hat dich gerade unterbrochen waehrend du noch "
|
||||||
|
f"gesprochen oder gearbeitet hast. Folgendes ist eine Korrektur, "
|
||||||
|
f"Ergaenzung oder ein Themenwechsel zu deiner letzten Antwort.] "
|
||||||
|
f"{text}"
|
||||||
|
if interrupted else text
|
||||||
|
)
|
||||||
|
logger.info("[rvs] App-Chat%s: '%s'",
|
||||||
|
" [BARGE-IN]" if interrupted else "", text[:80])
|
||||||
|
await self.send_to_core(core_text, source="app" + (" [barge-in]" if interrupted else ""))
|
||||||
return
|
return
|
||||||
|
|
||||||
if msg_type == "cancel_request":
|
if msg_type == "cancel_request":
|
||||||
@@ -1500,9 +1509,11 @@ class ARIABridge:
|
|||||||
self._next_speed_override = speed if 0.1 <= speed <= 5.0 else None
|
self._next_speed_override = speed if 0.1 <= speed <= 5.0 else None
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
self._next_speed_override = None
|
self._next_speed_override = None
|
||||||
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
|
interrupted = bool(payload.get("interrupted", False))
|
||||||
mime_type, duration_ms, len(audio_b64) // 1365)
|
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB%s",
|
||||||
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
mime_type, duration_ms, len(audio_b64) // 1365,
|
||||||
|
" [BARGE-IN]" if interrupted else "")
|
||||||
|
asyncio.create_task(self._process_app_audio(audio_b64, mime_type, interrupted))
|
||||||
|
|
||||||
elif msg_type == "stt_response":
|
elif msg_type == "stt_response":
|
||||||
# Antwort der whisper-bridge auf unseren stt_request
|
# Antwort der whisper-bridge auf unseren stt_request
|
||||||
@@ -1558,8 +1569,13 @@ class ARIABridge:
|
|||||||
_STT_REMOTE_TIMEOUT_READY_S = 45.0
|
_STT_REMOTE_TIMEOUT_READY_S = 45.0
|
||||||
_STT_REMOTE_TIMEOUT_LOADING_S = 300.0
|
_STT_REMOTE_TIMEOUT_LOADING_S = 300.0
|
||||||
|
|
||||||
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
|
async def _process_app_audio(self, audio_b64: str, mime_type: str, interrupted: bool = False) -> None:
|
||||||
"""App-Audio → STT → aria-core. Primaer via whisper-bridge (RVS), Fallback lokal."""
|
"""App-Audio → STT → aria-core. Primaer via whisper-bridge (RVS), Fallback lokal.
|
||||||
|
|
||||||
|
interrupted=True wenn der User waehrend ARIA noch sprach/dachte aufgenommen hat
|
||||||
|
(Barge-In). Wird als Hinweis-Praefix an aria-core mitgegeben damit ARIA die
|
||||||
|
Korrektur/Unterbrechung in den Kontext einordnen kann statt als reine
|
||||||
|
Folgefrage zu behandeln."""
|
||||||
# Erst Remote versuchen
|
# Erst Remote versuchen
|
||||||
text = await self._stt_remote(audio_b64, mime_type)
|
text = await self._stt_remote(audio_b64, mime_type)
|
||||||
if text is None:
|
if text is None:
|
||||||
@@ -1571,8 +1587,17 @@ class ARIABridge:
|
|||||||
|
|
||||||
if text.strip():
|
if text.strip():
|
||||||
logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
|
logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
|
||||||
|
# Barge-In-Hinweis: gibt ARIA den Kontext dass sie unterbrochen wurde
|
||||||
|
# und dies eine Korrektur/Aenderung der vorherigen Anweisung sein kann.
|
||||||
|
core_text = (
|
||||||
|
f"[Hinweis: Stefan hat dich gerade unterbrochen waehrend du noch "
|
||||||
|
f"gesprochen oder gearbeitet hast. Folgendes ist eine Korrektur, "
|
||||||
|
f"Ergaenzung oder ein Themenwechsel zu deiner letzten Antwort.] "
|
||||||
|
f"{text}"
|
||||||
|
if interrupted else text
|
||||||
|
)
|
||||||
# ERST an aria-core senden (wichtigster Schritt)
|
# ERST an aria-core senden (wichtigster Schritt)
|
||||||
await self.send_to_core(text, source="app-voice")
|
await self.send_to_core(core_text, source="app-voice" + (" [barge-in]" if interrupted else ""))
|
||||||
# STT-Text an RVS senden (fuer Anzeige in App + Diagnostic)
|
# STT-Text an RVS senden (fuer Anzeige in App + Diagnostic)
|
||||||
# sender="stt" damit Bridge es ignoriert (kein Loop)
|
# sender="stt" damit Bridge es ignoriert (kein Loop)
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user