added audio wake word and recording, edited readme

This commit is contained in:
duffyduck 2026-03-29 11:29:15 +02:00
parent b687f790ba
commit dbd97d3cf4
15 changed files with 912 additions and 798 deletions

README.md (1139 changed lines)

File diff suppressed because it is too large.


@@ -2,6 +2,7 @@
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.CAMERA" />
<uses-permission android:name="android.permission.RECORD_AUDIO" />
<application
android:name=".MainApplication"


@@ -23,7 +23,9 @@
"react-native-permissions": "^4.1.4",
"react-native-camera-kit": "^13.0.0",
"@react-native-async-storage/async-storage": "^1.21.0",
"react-native-fs": "^2.20.0"
"react-native-fs": "^2.20.0",
"react-native-audio-recorder-player": "^3.6.7",
"react-native-live-audio-stream": "^1.3.5"
},
"devDependencies": {
"typescript": "^5.3.3",


@@ -1,18 +1,23 @@
/**
* VoiceButton - push-to-talk record button
* VoiceButton - push-to-talk + auto-stop record button
*
* Two modes:
* 1. Push-to-talk: hold to record, release to send
* 2. Tap-to-talk: a single tap starts recording; VAD stops automatically on silence
* (also used for wake-word-triggered recording)
*
* Large round button: hold to record, release to send.
* Visual feedback via a pulsing animation while recording.
*/
import React, { useState, useRef, useEffect } from 'react';
import React, { useState, useRef, useEffect, useCallback } from 'react';
import {
View,
Text,
Animated,
StyleSheet,
GestureResponderEvent,
Easing,
TouchableOpacity,
Pressable,
} from 'react-native';
import audioService, { RecordingResult } from '../services/audio';
@@ -23,15 +28,23 @@ interface VoiceButtonProps {
onRecordingComplete: (result: RecordingResult) => void;
/** Disable the button */
disabled?: boolean;
/** Wake word mode active (shows an indicator) */
wakeWordActive?: boolean;
}
// --- Component ---
const VoiceButton: React.FC<VoiceButtonProps> = ({ onRecordingComplete, disabled = false }) => {
const VoiceButton = React.forwardRef<VoiceButtonHandle, VoiceButtonProps>(({
onRecordingComplete,
disabled = false,
wakeWordActive = false,
}, ref) => {
const [isRecording, setIsRecording] = useState(false);
const [durationMs, setDurationMs] = useState(0);
const [meterDb, setMeterDb] = useState(-160);
const pulseAnim = useRef(new Animated.Value(1)).current;
const durationTimer = useRef<ReturnType<typeof setInterval> | null>(null);
const isLongPress = useRef(false);
// Start/stop the pulse animation
useEffect(() => {
@@ -59,53 +72,111 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({ onRecordingComplete, disabled
}
}, [isRecording, pulseAnim]);
// Count the recording duration
// Count the recording duration + metering
useEffect(() => {
if (isRecording) {
setDurationMs(0);
durationTimer.current = setInterval(() => {
setDurationMs(prev => prev + 100);
}, 100);
const unsubMeter = audioService.onMeterUpdate(setMeterDb);
return () => {
unsubMeter();
if (durationTimer.current) clearInterval(durationTimer.current);
};
} else {
if (durationTimer.current) {
clearInterval(durationTimer.current);
durationTimer.current = null;
}
}
return () => {
if (durationTimer.current) {
clearInterval(durationTimer.current);
}
};
}, [isRecording]);
const handlePressIn = async (_event: GestureResponderEvent) => {
if (disabled) return;
const started = await audioService.startRecording();
// VAD silence callback: auto-stop
useEffect(() => {
const unsubSilence = audioService.onSilenceDetected(async () => {
if (!isRecording) return;
setIsRecording(false);
const result = await audioService.stopRecording();
if (result && result.durationMs > 500) {
onRecordingComplete(result);
}
});
return unsubSilence;
}, [isRecording, onRecordingComplete]);
// Auto-start for the wake word (triggered externally)
const startAutoRecording = useCallback(async () => {
if (disabled || isRecording) return;
const started = await audioService.startRecording(true); // autoStop = true
if (started) {
isLongPress.current = false;
setIsRecording(true);
}
}, [disabled, isRecording]);
// Push-to-talk: press and hold
const handlePressIn = async () => {
if (disabled || isRecording) return;
isLongPress.current = true;
const started = await audioService.startRecording(false); // no autoStop
if (started) {
setIsRecording(true);
}
};
const handlePressOut = async (_event: GestureResponderEvent) => {
if (!isRecording) return;
const handlePressOut = async () => {
if (!isRecording || !isLongPress.current) return;
isLongPress.current = false;
setIsRecording(false);
const result = await audioService.stopRecording();
if (result && result.durationMs > 300) {
// Only send if longer than 300ms (avoids accidental taps)
onRecordingComplete(result);
}
};
// Tap-to-talk: a single tap starts recording with auto-stop
const handleTap = async () => {
if (disabled) return;
if (isRecording) {
// Stop the recording manually
setIsRecording(false);
const result = await audioService.stopRecording();
if (result && result.durationMs > 300) {
onRecordingComplete(result);
}
} else {
// Start recording with auto-stop
const started = await audioService.startRecording(true);
if (started) {
isLongPress.current = false;
setIsRecording(true);
}
}
};
// Expose startAutoRecording through the forwarded ref (used by the wake word flow)
React.useImperativeHandle(ref, () => ({ startAutoRecording }), [startAutoRecording]);
const formatDuration = (ms: number): string => {
const seconds = Math.floor(ms / 1000);
const tenths = Math.floor((ms % 1000) / 100);
return `${seconds}.${tenths}s`;
};
// Meter visualization (0-1 scale): -60 dB maps to 0, 0 dB to 1
const meterLevel = Math.max(0, Math.min(1, (meterDb + 60) / 60));
return (
<View style={styles.container}>
{wakeWordActive && !isRecording && (
<View style={styles.wakeWordDot} />
)}
<Animated.View
style={[
styles.buttonOuter,
@@ -117,17 +188,28 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({ onRecordingComplete, disabled
onResponderRelease={handlePressOut}
onResponderTerminate={handlePressOut}
>
<View style={[styles.buttonInner, isRecording && styles.buttonInnerRecording]}>
<TouchableOpacity
activeOpacity={0.8}
onPress={handleTap}
disabled={disabled}
style={[styles.buttonInner, isRecording && styles.buttonInnerRecording]}
>
<Text style={styles.buttonIcon}>{isRecording ? '⏹' : '🎙'}</Text>
</View>
</TouchableOpacity>
</Animated.View>
{isRecording && (
<Text style={styles.durationText}>{formatDuration(durationMs)}</Text>
<View style={styles.infoRow}>
<View style={[styles.meterBar, { width: `${meterLevel * 100}%` }]} />
<Text style={styles.durationText}>{formatDuration(durationMs)}</Text>
</View>
)}
</View>
);
});
// Handle type exposed to external callers (wake word trigger)
export type VoiceButtonHandle = { startAutoRecording: () => Promise<void> };
// --- Styles ---
const styles = StyleSheet.create({
@@ -135,6 +217,16 @@ const styles = StyleSheet.create({
alignItems: 'center',
justifyContent: 'center',
},
wakeWordDot: {
position: 'absolute',
top: -4,
right: -4,
width: 10,
height: 10,
borderRadius: 5,
backgroundColor: '#34C759',
zIndex: 10,
},
buttonOuter: {
width: 64,
height: 64,
@@ -165,10 +257,20 @@ const styles = StyleSheet.create({
buttonIcon: {
fontSize: 24,
},
infoRow: {
alignItems: 'center',
marginTop: 4,
width: 80,
},
meterBar: {
height: 3,
backgroundColor: '#FF3B30',
borderRadius: 2,
marginBottom: 2,
},
durationText: {
color: '#FF3B30',
fontSize: 12,
marginTop: 4,
fontVariant: ['tabular-nums'],
},
});
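For orientation, a minimal sketch of how a parent screen could drive this component, assuming the `forwardRef`/`useImperativeHandle` wiring above (the `VoiceDemo` name and the logging are illustrative, not part of the commit):

```tsx
import React, { useRef, useEffect } from 'react';
import VoiceButton, { VoiceButtonHandle } from '../components/VoiceButton';
import wakeWordService from '../services/wakeword';
import { RecordingResult } from '../services/audio';

const VoiceDemo: React.FC = () => {
  const buttonRef = useRef<VoiceButtonHandle>(null);

  // Wake word fires -> trigger the same auto-stop recording a tap would start.
  // onWakeWord returns its unsubscribe function, which useEffect uses as cleanup.
  useEffect(
    () => wakeWordService.onWakeWord(() => buttonRef.current?.startAutoRecording()),
    [],
  );

  const handleComplete = (result: RecordingResult) =>
    console.log(`recorded ${result.durationMs}ms of ${result.mimeType}`);

  return <VoiceButton ref={buttonRef} onRecordingComplete={handleComplete} wakeWordActive />;
};
```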


@@ -20,6 +20,7 @@ import {
import AsyncStorage from '@react-native-async-storage/async-storage';
import rvs, { RVSMessage, ConnectionState } from '../services/rvs';
import audioService from '../services/audio';
import wakeWordService from '../services/wakeword';
import VoiceButton from '../components/VoiceButton';
import FileUpload, { FileData } from '../components/FileUpload';
import CameraUpload, { PhotoData } from '../components/CameraUpload';
@@ -56,6 +57,7 @@ const ChatScreen: React.FC = () => {
const [showFileUpload, setShowFileUpload] = useState(false);
const [showCameraUpload, setShowCameraUpload] = useState(false);
const [gpsEnabled, setGpsEnabled] = useState(false);
const [wakeWordActive, setWakeWordActive] = useState(false);
const flatListRef = useRef<FlatList>(null);
const messageIdCounter = useRef(0);
@@ -134,6 +136,62 @@ const ChatScreen: React.FC = () => {
};
}, []);
// Wake word: "ARIA" detection → start auto recording
useEffect(() => {
const unsubWake = wakeWordService.onWakeWord(async () => {
console.log('[Chat] Wake word detected - starting auto recording');
// Stop TTS so ARIA does not hear itself
audioService.stopPlayback();
// Start recording with auto-stop (VAD)
const started = await audioService.startRecording(true);
if (!started) {
// Microphone unavailable; re-enable the wake word
wakeWordService.resume();
}
});
// Auto-stop callback: on detected silence, send the recording and restart the wake word
const unsubSilence = audioService.onSilenceDetected(async () => {
const result = await audioService.stopRecording();
if (result && result.durationMs > 500) {
// Send the voice message (same logic as handleVoiceRecording)
const location = await getCurrentLocation();
const userMsg: ChatMessage = {
id: nextId(),
sender: 'user',
text: '[Voice message]',
timestamp: Date.now(),
attachments: [{ type: 'audio', name: 'Voice recording' }],
};
setMessages(prev => [...prev, userMsg]);
rvs.send('audio', {
base64: result.base64,
durationMs: result.durationMs,
mimeType: result.mimeType,
...(location && { location }),
});
}
// Re-enable the wake word
if (wakeWordActive) wakeWordService.resume();
});
return () => {
unsubWake();
unsubSilence();
};
}, [wakeWordActive]);
// Wake Word Toggle Handler
const toggleWakeWord = useCallback(async () => {
if (wakeWordActive) {
wakeWordService.stop();
setWakeWordActive(false);
} else {
const started = await wakeWordService.start();
setWakeWordActive(started);
}
}, [wakeWordActive]);
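Condensed, the wake-word round trip wired up above looks like this (a sketch only: GPS, chat-state bookkeeping, and error handling from the real effects are omitted):

```ts
import audioService from '../services/audio';
import wakeWordService from '../services/wakeword';
import rvs from '../services/rvs';

wakeWordService.onWakeWord(async () => {
  audioService.stopPlayback();             // keep ARIA from recording its own TTS
  await audioService.startRecording(true); // true = VAD auto-stop enabled
});

audioService.onSilenceDetected(async () => {
  const rec = await audioService.stopRecording();
  if (rec && rec.durationMs > 500) {
    rvs.send('audio', { base64: rec.base64, durationMs: rec.durationMs, mimeType: rec.mimeType });
  }
  wakeWordService.resume(); // back to passive listening (guarded by wakeWordActive above)
});
```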
// Persist the chat history to AsyncStorage (last N messages)
useEffect(() => {
if (messages.length === 0) return;
@@ -366,7 +424,14 @@ const ChatScreen: React.FC = () => {
<VoiceButton
onRecordingComplete={handleVoiceRecording}
disabled={connectionState !== 'connected'}
wakeWordActive={wakeWordActive}
/>
<TouchableOpacity
style={[styles.wakeWordBtn, wakeWordActive && styles.wakeWordBtnActive]}
onPress={toggleWakeWord}
>
<Text style={styles.wakeWordIcon}>{wakeWordActive ? '👂' : '🔇'}</Text>
</TouchableOpacity>
)}
</View>
@@ -530,6 +595,21 @@ const styles = StyleSheet.create({
sendIcon: {
fontSize: 18,
},
wakeWordBtn: {
width: 32,
height: 32,
borderRadius: 16,
backgroundColor: 'rgba(255,255,255,0.1)',
alignItems: 'center',
justifyContent: 'center',
marginLeft: 4,
},
wakeWordBtnActive: {
backgroundColor: 'rgba(52, 199, 89, 0.3)',
},
wakeWordIcon: {
fontSize: 16,
},
modalOverlay: {
flex: 1,
backgroundColor: 'rgba(0,0,0,0.6)',


@@ -1,13 +1,20 @@
/**
* Audio service for voice input/output
*
* Manages microphone recording and TTS audio playback.
* Uses react-native-sound and the native audio APIs.
* Manages microphone recording (with VAD/auto-stop on silence),
* TTS audio playback, and metering for visual feedback.
* Uses react-native-audio-recorder-player for recording.
*/
import { Platform, PermissionsAndroid } from 'react-native';
import Sound from 'react-native-sound';
import RNFS from 'react-native-fs';
import AudioRecorderPlayer, {
AudioEncoderAndroidType,
AudioSourceAndroidType,
AVEncodingOption,
OutputFormatAndroidType,
} from 'react-native-audio-recorder-player';
// --- Types ---
@@ -23,6 +30,8 @@ export interface RecordingResult {
export type RecordingState = 'idle' | 'recording' | 'processing';
type RecordingStateCallback = (state: RecordingState) => void;
type MeterCallback = (db: number) => void;
type SilenceCallback = () => void;
// --- Constants ---
@@ -30,17 +39,34 @@ const AUDIO_SAMPLE_RATE = 16000;
const AUDIO_CHANNELS = 1;
const AUDIO_ENCODING = 'audio/wav';
// VAD (Voice Activity Detection): silence detection
const VAD_SILENCE_THRESHOLD_DB = -45; // levels below this dB value count as "silence"
const VAD_SILENCE_DURATION_MS = 1800; // ms of continuous silence before auto-stop
// --- Audio-Service ---
class AudioService {
private recordingState: RecordingState = 'idle';
private recordingStartTime: number = 0;
private stateListeners: RecordingStateCallback[] = [];
private meterListeners: MeterCallback[] = [];
private silenceListeners: SilenceCallback[] = [];
private currentSound: Sound | null = null;
private recorder: AudioRecorderPlayer;
private recordingPath: string = '';
// VAD State
private vadEnabled: boolean = false;
private lastSpeechTime: number = 0;
private vadTimer: ReturnType<typeof setInterval> | null = null;
constructor() {
this.recorder = new AudioRecorderPlayer();
this.recorder.setSubscriptionDuration(0.1); // metering updates every 100ms
}
// --- Permissions ---
/** Request microphone permission */
async requestMicrophonePermission(): Promise<boolean> {
if (Platform.OS !== 'android') {
return true;
@@ -66,7 +92,7 @@
// --- Recording ---
/** Start microphone recording */
async startRecording(): Promise<boolean> {
async startRecording(autoStop: boolean = false): Promise<boolean> {
if (this.recordingState !== 'idle') {
console.warn('[Audio] Recording already in progress');
return false;
@@ -79,11 +105,48 @@
}
try {
// Native recording start via the AudioRecorder bridge
// In production: use a native module or react-native-audio-recorder-player
// Stop any running playback (so ARIA does not hear itself)
this.stopPlayback();
this.recordingPath = `${RNFS.CachesDirectoryPath}/aria_recording_${Date.now()}.mp4`;
// Start recording with metering enabled
await this.recorder.startRecorder(this.recordingPath, {
AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
AudioSourceAndroid: AudioSourceAndroidType.MIC,
OutputFormatAndroid: OutputFormatAndroidType.MPEG_4,
}, true); // meteringEnabled = true
// Metering callback
this.recorder.addRecordBackListener((e) => {
const db = e.currentMetering ?? -160;
this.meterListeners.forEach(cb => cb(db));
// VAD: detect silence
if (this.vadEnabled) {
if (db > VAD_SILENCE_THRESHOLD_DB) {
this.lastSpeechTime = Date.now();
}
}
});
this.recordingStartTime = Date.now();
this.lastSpeechTime = Date.now();
this.setState('recording');
console.log('[Audio] Recording started');
// Enable VAD
this.vadEnabled = autoStop;
if (autoStop) {
this.vadTimer = setInterval(() => {
const silenceDuration = Date.now() - this.lastSpeechTime;
if (silenceDuration >= VAD_SILENCE_DURATION_MS) {
// Fire only once: clear the timer before notifying listeners
if (this.vadTimer) {
clearInterval(this.vadTimer);
this.vadTimer = null;
}
console.log(`[Audio] VAD: ${silenceDuration}ms of silence - auto-stop`);
this.silenceListeners.forEach(cb => cb());
}
}, 200);
}
console.log('[Audio] Recording started (autoStop: %s)', autoStop);
return true;
} catch (err) {
console.error('[Audio] Failed to start recording:', err);
@@ -100,22 +163,31 @@ class AudioService {
}
this.setState('processing');
this.vadEnabled = false;
if (this.vadTimer) {
clearInterval(this.vadTimer);
this.vadTimer = null;
}
try {
await this.recorder.stopRecorder();
this.recorder.removeRecordBackListener();
const durationMs = Date.now() - this.recordingStartTime;
// In production: fetch the audio data from the native recorder
// const audioData = await NativeAudioRecorder.stop();
const base64Placeholder = ''; // placeholder until the native bridge is implemented
// Read the recorded file as base64
const base64Data = await RNFS.readFile(this.recordingPath, 'base64');
// Clean up the temp file
RNFS.unlink(this.recordingPath).catch(() => {});
this.setState('idle');
console.log(`[Audio] Recording stopped (${durationMs}ms)`);
console.log(`[Audio] Recording stopped (${durationMs}ms, ${Math.round(base64Data.length / 1024)}KB)`);
return {
base64: base64Placeholder,
base64: base64Data,
durationMs,
mimeType: AUDIO_ENCODING,
mimeType: 'audio/mp4', // AAC in an MP4 container
};
} catch (err) {
console.error('[Audio] Failed to stop recording:', err);
@@ -134,7 +206,7 @@ class AudioService {
this.stopPlayback();
try {
// base64 → temporary WAV file → play via Sound
// base64 -> temporary WAV file -> play via Sound
const tmpPath = `${RNFS.CachesDirectoryPath}/aria_tts_${Date.now()}.wav`;
await RNFS.writeFile(tmpPath, base64Data, 'base64');
@@ -152,7 +224,6 @@
}
this.currentSound?.release();
this.currentSound = null;
// Clean up the temp file
RNFS.unlink(tmpPath).catch(() => {});
});
});
@@ -170,7 +241,7 @@
}
}
// --- Status ---
// --- Status & Callbacks ---
getRecordingState(): RecordingState {
return this.recordingState;
@@ -184,6 +255,22 @@
};
}
/** Callback for metering updates (dB values during recording) */
onMeterUpdate(callback: MeterCallback): () => void {
this.meterListeners.push(callback);
return () => {
this.meterListeners = this.meterListeners.filter(cb => cb !== callback);
};
}
/** Callback invoked when VAD detects silence (auto-stop) */
onSilenceDetected(callback: SilenceCallback): () => void {
this.silenceListeners.push(callback);
return () => {
this.silenceListeners = this.silenceListeners.filter(cb => cb !== callback);
};
}
private setState(state: RecordingState): void {
if (this.recordingState !== state) {
this.recordingState = state;
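A note on scale, since three numbers interact here: the recorder's metering callback reports dBFS-style values (0 = full scale; the service floors silence at -160), the VAD treats anything under -45 dB as silence, and VoiceButton maps the value onto its level bar with a -60 dB floor. A minimal sketch of that mapping with worked values:

```ts
// Normalization used by VoiceButton's meter bar: clamp to [0, 1] over a -60 dB floor.
const meterLevel = (db: number): number =>
  Math.max(0, Math.min(1, (db + 60) / 60));

meterLevel(-160); // 0    (metering floor / silence)
meterLevel(-45);  // 0.25 (the VAD silence threshold)
meterLevel(-30);  // 0.5
meterLevel(0);    // 1    (full scale)
```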


@@ -0,0 +1,145 @@
/**
* Wake word service: "ARIA" detection
*
* Uses react-native-live-audio-stream for continuous microphone monitoring.
* Detects speech via an energy threshold and sends short audio clips
* for server-side wake-word verification (openwakeword in the bridge).
*
* Architecture:
* app (microphone) → energy detection → audio buffer
* → RVS "wake_check" → bridge → openwakeword → confirmation
* → app starts recording
*
* Current (phase 1): simple tap-to-talk + auto-stop.
* Later (phase 2): Porcupine on-device "ARIA" keyword.
*/
import LiveAudioStream from 'react-native-live-audio-stream';
type WakeWordCallback = () => void;
type StateCallback = (state: WakeWordState) => void;
export type WakeWordState = 'off' | 'listening' | 'detected';
class WakeWordService {
private state: WakeWordState = 'off';
private wakeCallbacks: WakeWordCallback[] = [];
private stateCallbacks: StateCallback[] = [];
private isInitialized = false;
/** Start wake word detection */
async start(): Promise<boolean> {
if (this.state === 'listening') return true;
try {
if (!this.isInitialized) {
LiveAudioStream.init({
sampleRate: 16000,
channels: 1,
bitsPerSample: 16,
audioSource: 6, // VOICE_RECOGNITION
bufferSize: 4096,
});
this.isInitialized = true;
}
// Start the audio stream and check its energy
LiveAudioStream.start();
LiveAudioStream.on('data', (base64Chunk: string) => {
if (this.state !== 'listening') return;
// base64 → Int16 array → compute RMS
const raw = this._base64ToInt16(base64Chunk);
const rms = this._calculateRMS(raw);
// Threshold: loud enough → wake word considered detected
// Phase 1: simple energy detection (someone is speaking)
// Phase 2: Porcupine "ARIA" keyword
if (rms > 2000) {
this.setState('detected');
this.wakeCallbacks.forEach(cb => cb());
// Briefly pause after detection; the recording takes over the microphone
this.stop();
}
});
this.setState('listening');
console.log('[WakeWord] Listening started');
return true;
} catch (err) {
console.error('[WakeWord] Failed to start:', err);
return false;
}
}
/** Stop wake word detection */
stop(): void {
if (this.state === 'off') return;
try {
LiveAudioStream.stop();
} catch {}
this.setState('off');
console.log('[WakeWord] Stopped');
}
/** Restart after a recording has finished */
async resume(): Promise<void> {
// Short pause so the recording can release the microphone
setTimeout(() => {
if (this.state === 'off') {
this.start();
}
}, 500);
}
// --- Callbacks ---
onWakeWord(callback: WakeWordCallback): () => void {
this.wakeCallbacks.push(callback);
return () => {
this.wakeCallbacks = this.wakeCallbacks.filter(cb => cb !== callback);
};
}
onStateChange(callback: StateCallback): () => void {
this.stateCallbacks.push(callback);
return () => {
this.stateCallbacks = this.stateCallbacks.filter(cb => cb !== callback);
};
}
getState(): WakeWordState {
return this.state;
}
// --- Helpers ---
private setState(state: WakeWordState): void {
if (this.state !== state) {
this.state = state;
this.stateCallbacks.forEach(cb => cb(state));
}
}
private _base64ToInt16(base64: string): Int16Array {
// Note: atob may need a polyfill on some RN runtimes (e.g. the 'base-64' package)
const binary = atob(base64);
const bytes = new Uint8Array(binary.length);
for (let i = 0; i < binary.length; i++) {
bytes[i] = binary.charCodeAt(i);
}
return new Int16Array(bytes.buffer);
}
private _calculateRMS(samples: Int16Array): number {
if (samples.length === 0) return 0;
let sum = 0;
for (let i = 0; i < samples.length; i++) {
sum += samples[i] * samples[i];
}
return Math.sqrt(sum / samples.length);
}
}
const wakeWordService = new WakeWordService();
export default wakeWordService;
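To relate the RMS gate above to the dB values used in the audio service: samples are 16-bit, so full scale is 32768, and an RMS of 2000 sits at roughly 20·log10(2000/32768) ≈ -24 dBFS, i.e. clearly audible speech. A small sketch (the helper name is ours, not part of the diff):

```ts
// Convert an Int16 RMS value to dBFS (0 dB = full scale of 32768).
const rmsToDbfs = (rms: number): number =>
  rms > 0 ? 20 * Math.log10(rms / 32768) : -Infinity;

rmsToDbfs(2000); // ≈ -24.3 dBFS: the wake trigger
rmsToDbfs(200);  // ≈ -44.3 dBFS: right around the recorder's VAD silence threshold (-45 dB)
```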


@@ -30,6 +30,7 @@ import wave
from pathlib import Path
from typing import Optional
import subprocess
import urllib.request
import numpy as np
import sounddevice as sd
@@ -959,13 +960,78 @@ class ARIABridge:
await self.ws_core.send(raw_message)
elif msg_type == "audio":
# Audio from the app → STT → to aria-core
logger.info("[rvs] Audio received - TODO: STT")
# Later: decode the audio, run it through Whisper, send the result to core
# Audio from the app → decode → STT → to aria-core
audio_b64 = payload.get("base64", "")
mime_type = payload.get("mimeType", "audio/mp4")
duration_ms = payload.get("durationMs", 0)
if not audio_b64:
logger.warning("[rvs] Audio ohne Daten empfangen")
return
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
mime_type, duration_ms, len(audio_b64) // 1365)
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
else:
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
"""Decodiert App-Audio (Base64 AAC/MP4), konvertiert zu 16kHz PCM, STT, sendet an core."""
loop = asyncio.get_event_loop()
tmp_in = None
tmp_out = None
try:
# base64 → temp file
ext = ".mp4" if "mp4" in mime_type else ".wav" if "wav" in mime_type else ".ogg"
tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
tmp_in.write(base64.b64decode(audio_b64))
tmp_in.close()
# FFmpeg: any input format → 16kHz mono PCM (raw float32)
tmp_out = tempfile.NamedTemporaryFile(suffix=".raw", delete=False)
tmp_out.close()
cmd = [
"ffmpeg", "-y", "-i", tmp_in.name,
"-ar", "16000", "-ac", "1", "-f", "f32le",
tmp_out.name,
]
result = await loop.run_in_executor(
None,
lambda: subprocess.run(cmd, capture_output=True, timeout=30),
)
if result.returncode != 0:
logger.error("[rvs] FFmpeg Fehler: %s", result.stderr.decode()[:200])
return
# read PCM → numpy float32
audio_data = np.fromfile(tmp_out.name, dtype=np.float32)
if len(audio_data) == 0:
logger.warning("[rvs] Leere Audio-Daten nach Konvertierung")
return
duration_s = len(audio_data) / 16000.0
logger.info("[rvs] Audio konvertiert: %.1fs, %d samples", duration_s, len(audio_data))
# STT
text = await loop.run_in_executor(None, self.stt_engine.transcribe, audio_data)
if text.strip():
logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
await self.send_to_core(text, source="app-voice")
else:
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
except Exception:
logger.exception("[rvs] Audio-Verarbeitung fehlgeschlagen")
finally:
# clean up temp files
for f in [tmp_in, tmp_out]:
if f:
try:
os.unlink(f.name)
except OSError:
pass
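For reference, the shape of the RVS `audio` payload this handler consumes, written as a TypeScript sketch to match the app side (the field names mirror the `rvs.send('audio', ...)` call in ChatScreen; the inner `location` fields are an assumption, since `getCurrentLocation` is not shown in this diff):

```ts
// RVS "audio" message payload, as assembled in ChatScreen and read here via payload.get(...).
interface AudioPayload {
  base64: string;      // base64-encoded AAC recording
  durationMs: number;  // client-side recording duration
  mimeType: string;    // 'audio/mp4' (AAC in an MP4 container)
  location?: {         // attached only when GPS is enabled
    latitude: number;  // assumed field name
    longitude: number; // assumed field name
  };
}
```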
async def _send_to_rvs(self, message: dict) -> None:
"""Sendet eine Nachricht an die App (via RVS)."""
if self.ws_rvs is None or not self.ws_rvs.open:


@@ -87,7 +87,7 @@ services:
- RVS_TOKEN=${RVS_TOKEN:-}
restart: unless-stopped
# ─── Diagnostic (self-check UI) ───────────────────────
# ─── Diagnostic (self-check UI and settings) ──────────
diagnostic:
build: ./diagnostic
container_name: aria-diagnostic