Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 49089eee4b | |||
| e544992c9f | |||
| 97a1a3089a | |||
| 64f18e97a0 | |||
| 9cbea27455 | |||
| c8881f9e4d | |||
| 028e3b2240 | |||
| c042f27106 |
@@ -79,8 +79,8 @@ android {
|
|||||||
applicationId "com.ariacockpit"
|
applicationId "com.ariacockpit"
|
||||||
minSdkVersion rootProject.ext.minSdkVersion
|
minSdkVersion rootProject.ext.minSdkVersion
|
||||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||||
versionCode 408
|
versionCode 502
|
||||||
versionName "0.0.4.8"
|
versionName "0.0.5.2"
|
||||||
// Fallback fuer Libraries mit Product Flavors
|
// Fallback fuer Libraries mit Product Flavors
|
||||||
missingDimensionStrategy 'react-native-camera', 'general'
|
missingDimensionStrategy 'react-native-camera', 'general'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "aria-cockpit",
|
"name": "aria-cockpit",
|
||||||
"version": "0.0.4.8",
|
"version": "0.0.5.2",
|
||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"android": "react-native run-android",
|
"android": "react-native run-android",
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import {
|
|||||||
Image,
|
Image,
|
||||||
ScrollView,
|
ScrollView,
|
||||||
Modal,
|
Modal,
|
||||||
|
ToastAndroid,
|
||||||
} from 'react-native';
|
} from 'react-native';
|
||||||
import AsyncStorage from '@react-native-async-storage/async-storage';
|
import AsyncStorage from '@react-native-async-storage/async-storage';
|
||||||
import RNFS from 'react-native-fs';
|
import RNFS from 'react-native-fs';
|
||||||
@@ -325,6 +326,26 @@ const ChatScreen: React.FC = () => {
|
|||||||
const tool = (message.payload.tool as string) || '';
|
const tool = (message.payload.tool as string) || '';
|
||||||
setAgentActivity({ activity, tool });
|
setAgentActivity({ activity, tool });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Voice-Config aus Diagnostic — setzt die lokale App-Stimme auf den
|
||||||
|
// gerade in Diagnostic gewaehlten Wert zurueck. User-Wahl in der App
|
||||||
|
// wird dadurch ueberschrieben.
|
||||||
|
if (message.type === ('config' as any)) {
|
||||||
|
const newVoice = ((message.payload as any).xttsVoice as string) ?? '';
|
||||||
|
localXttsVoiceRef.current = newVoice;
|
||||||
|
AsyncStorage.setItem('aria_xtts_voice', newVoice);
|
||||||
|
}
|
||||||
|
|
||||||
|
// XTTS-Bridge meldet Stimme fertig geladen (kurzer Status-Toast)
|
||||||
|
if (message.type === ('voice_ready' as any)) {
|
||||||
|
const v = ((message.payload as any).voice as string) ?? '';
|
||||||
|
const err = (message.payload as any).error as string | undefined;
|
||||||
|
if (err) {
|
||||||
|
ToastAndroid.show(`Stimme "${v}" Fehler: ${err}`, ToastAndroid.LONG);
|
||||||
|
} else {
|
||||||
|
ToastAndroid.show(`Stimme "${v || 'Standard'}" bereit`, ToastAndroid.SHORT);
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
const unsubState = rvs.onStateChange((state) => {
|
const unsubState = rvs.onStateChange((state) => {
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ import {
|
|||||||
StyleSheet,
|
StyleSheet,
|
||||||
Alert,
|
Alert,
|
||||||
Platform,
|
Platform,
|
||||||
|
ToastAndroid,
|
||||||
|
ActivityIndicator,
|
||||||
} from 'react-native';
|
} from 'react-native';
|
||||||
import AsyncStorage from '@react-native-async-storage/async-storage';
|
import AsyncStorage from '@react-native-async-storage/async-storage';
|
||||||
import RNFS from 'react-native-fs';
|
import RNFS from 'react-native-fs';
|
||||||
@@ -82,6 +84,7 @@ const SettingsScreen: React.FC = () => {
|
|||||||
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
||||||
const [editingPath, setEditingPath] = useState(false);
|
const [editingPath, setEditingPath] = useState(false);
|
||||||
const [xttsVoice, setXttsVoice] = useState('');
|
const [xttsVoice, setXttsVoice] = useState('');
|
||||||
|
const [loadingVoice, setLoadingVoice] = useState<string | null>(null);
|
||||||
const [availableVoices, setAvailableVoices] = useState<Array<{name: string, size: number}>>([]);
|
const [availableVoices, setAvailableVoices] = useState<Array<{name: string, size: number}>>([]);
|
||||||
const [voiceCloneVisible, setVoiceCloneVisible] = useState(false);
|
const [voiceCloneVisible, setVoiceCloneVisible] = useState(false);
|
||||||
const [tempPath, setTempPath] = useState('');
|
const [tempPath, setTempPath] = useState('');
|
||||||
@@ -265,6 +268,31 @@ const SettingsScreen: React.FC = () => {
|
|||||||
}
|
}
|
||||||
rvs.send('xtts_list_voices' as any, {});
|
rvs.send('xtts_list_voices' as any, {});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Diagnostic-Voice-Wechsel → lokale App-Stimme auf den neuen Default zuruecksetzen.
|
||||||
|
// Zusaetzlich Preload triggern, damit der User weiss wann's geladen ist.
|
||||||
|
if (message.type === ('config' as any)) {
|
||||||
|
const newVoice = ((message.payload as any).xttsVoice as string) ?? '';
|
||||||
|
setXttsVoice(newVoice);
|
||||||
|
AsyncStorage.setItem('aria_xtts_voice', newVoice);
|
||||||
|
if (newVoice) {
|
||||||
|
setLoadingVoice(newVoice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// XTTS-Bridge meldet: Stimme fertig geladen
|
||||||
|
if (message.type === ('voice_ready' as any)) {
|
||||||
|
const v = ((message.payload as any).voice as string) ?? '';
|
||||||
|
const err = (message.payload as any).error as string | undefined;
|
||||||
|
const ms = (message.payload as any).loadMs as number | undefined;
|
||||||
|
setLoadingVoice(null);
|
||||||
|
if (err) {
|
||||||
|
ToastAndroid.show(`Stimme "${v}" konnte nicht geladen werden: ${err}`, ToastAndroid.LONG);
|
||||||
|
} else {
|
||||||
|
const suffix = ms ? ` (${(ms / 1000).toFixed(1)}s)` : '';
|
||||||
|
ToastAndroid.show(`Stimme "${v || 'Standard'}" bereit${suffix}`, ToastAndroid.SHORT);
|
||||||
|
}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return () => {
|
return () => {
|
||||||
@@ -333,6 +361,13 @@ const SettingsScreen: React.FC = () => {
|
|||||||
const selectVoice = useCallback((voiceName: string) => {
|
const selectVoice = useCallback((voiceName: string) => {
|
||||||
setXttsVoice(voiceName);
|
setXttsVoice(voiceName);
|
||||||
AsyncStorage.setItem('aria_xtts_voice', voiceName);
|
AsyncStorage.setItem('aria_xtts_voice', voiceName);
|
||||||
|
// Preload nur fuer Custom-Voices — "Standard" braucht keinen Ladevorgang
|
||||||
|
if (voiceName) {
|
||||||
|
setLoadingVoice(voiceName);
|
||||||
|
rvs.send('voice_preload' as any, { voice: voiceName, source: 'app' });
|
||||||
|
} else {
|
||||||
|
setLoadingVoice(null);
|
||||||
|
}
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const deleteVoice = useCallback((name: string) => {
|
const deleteVoice = useCallback((name: string) => {
|
||||||
@@ -612,7 +647,10 @@ const SettingsScreen: React.FC = () => {
|
|||||||
</Text>
|
</Text>
|
||||||
<Text style={styles.voiceRowMeta}>{(v.size / 1024).toFixed(0)} KB</Text>
|
<Text style={styles.voiceRowMeta}>{(v.size / 1024).toFixed(0)} KB</Text>
|
||||||
</TouchableOpacity>
|
</TouchableOpacity>
|
||||||
{xttsVoice === v.name && <Text style={styles.voiceRowCheck}>{'\u2713'}</Text>}
|
{loadingVoice === v.name && (
|
||||||
|
<ActivityIndicator size="small" color="#0096FF" style={{marginRight: 8}} />
|
||||||
|
)}
|
||||||
|
{xttsVoice === v.name && loadingVoice !== v.name && <Text style={styles.voiceRowCheck}>{'\u2713'}</Text>}
|
||||||
<TouchableOpacity onPress={() => deleteVoice(v.name)} style={styles.voiceRowDelete}>
|
<TouchableOpacity onPress={() => deleteVoice(v.name)} style={styles.voiceRowDelete}>
|
||||||
<Text style={styles.voiceRowDeleteIcon}>X</Text>
|
<Text style={styles.voiceRowDeleteIcon}>X</Text>
|
||||||
</TouchableOpacity>
|
</TouchableOpacity>
|
||||||
|
|||||||
+131
-43
@@ -257,6 +257,12 @@ def clean_text_for_tts(text: str) -> str:
|
|||||||
for pat, repl in _UNIT_WORDS:
|
for pat, repl in _UNIT_WORDS:
|
||||||
t = _re_tts.sub(pat, repl, t)
|
t = _re_tts.sub(pat, repl, t)
|
||||||
|
|
||||||
|
# Generisches Buchstabieren: alle verbleibenden 2-5-Zeichen-Grossbuchstaben-Woerter
|
||||||
|
# (XTTS, USB, DNS, JSON, HTML, ...) → "X T T S". Laeuft NACH der expliziten Liste,
|
||||||
|
# damit TTS/GPU/... schon aufgeloest sind. "WLAN"-artige, die als Wort gesprochen
|
||||||
|
# werden, koennen bei Bedarf explizit in _UNIT_WORDS uebersteuert werden.
|
||||||
|
t = _re_tts.sub(r'\b([A-Z]{2,5})\b', lambda m: " ".join(m.group(1)), t)
|
||||||
|
|
||||||
# Anfuehrungszeichen
|
# Anfuehrungszeichen
|
||||||
t = _re_tts.sub(r'["""„`]', '', t)
|
t = _re_tts.sub(r'["""„`]', '', t)
|
||||||
|
|
||||||
@@ -319,8 +325,16 @@ class STTEngine:
|
|||||||
Erkannter Text oder leerer String.
|
Erkannter Text oder leerer String.
|
||||||
"""
|
"""
|
||||||
if self.model is None:
|
if self.model is None:
|
||||||
logger.error("Whisper-Modell nicht initialisiert")
|
# Lazy-Load: normalerweise laeuft STT remote auf der Gamebox.
|
||||||
return ""
|
# Erst wenn das Fallback hier zuschlaegt, laden wir lokal.
|
||||||
|
logger.info("Lokales Whisper-Fallback — Modell wird nachgeladen...")
|
||||||
|
try:
|
||||||
|
self.initialize()
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Lokales Whisper konnte nicht geladen werden")
|
||||||
|
return ""
|
||||||
|
if self.model is None:
|
||||||
|
return ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Audio als float32 normalisieren
|
# Audio als float32 normalisieren
|
||||||
@@ -517,6 +531,9 @@ class ARIABridge:
|
|||||||
# Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt.
|
# Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt.
|
||||||
# So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request).
|
# So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request).
|
||||||
self._next_voice_override: Optional[str] = None
|
self._next_voice_override: Optional[str] = None
|
||||||
|
# STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten.
|
||||||
|
# requestId → Future mit dem Text (oder None bei Fehler).
|
||||||
|
self._pending_stt: dict[str, asyncio.Future] = {}
|
||||||
|
|
||||||
def initialize(self) -> None:
|
def initialize(self) -> None:
|
||||||
"""Initialisiert alle Komponenten.
|
"""Initialisiert alle Komponenten.
|
||||||
@@ -529,8 +546,9 @@ class ARIABridge:
|
|||||||
logger.info("ARIA Voice Bridge startet...")
|
logger.info("ARIA Voice Bridge startet...")
|
||||||
logger.info("=" * 50)
|
logger.info("=" * 50)
|
||||||
|
|
||||||
# STT IMMER laden — verarbeitet Audio von der App (braucht kein Sounddevice)
|
# STT wird standardmaessig von der whisper-bridge (Gamebox) erledigt.
|
||||||
self.stt_engine.initialize()
|
# Lokales Whisper ist nur Fallback und wird lazy geladen wenn remote nicht
|
||||||
|
# antwortet. Das spart RAM auf der VM und Startup-Zeit.
|
||||||
|
|
||||||
# Audio-Hardware pruefen (fuer lokales Mikro/Lautsprecher)
|
# Audio-Hardware pruefen (fuer lokales Mikro/Lautsprecher)
|
||||||
self.audio_available = False
|
self.audio_available = False
|
||||||
@@ -1189,11 +1207,16 @@ class ARIABridge:
|
|||||||
changed = True
|
changed = True
|
||||||
if "whisperModel" in payload:
|
if "whisperModel" in payload:
|
||||||
new_model = payload["whisperModel"]
|
new_model = payload["whisperModel"]
|
||||||
if new_model and new_model != self.stt_engine.model_size:
|
allowed = {"tiny", "base", "small", "medium", "large-v3"}
|
||||||
logger.info("[rvs] Whisper-Modell Wechsel: %s -> %s (laedt...)", self.stt_engine.model_size, new_model)
|
if new_model in allowed and new_model != self.stt_engine.model_size:
|
||||||
loop = asyncio.get_event_loop()
|
# Merken und mitschicken an whisper-bridge (Gamebox).
|
||||||
if await loop.run_in_executor(None, self.stt_engine.reload, new_model):
|
# Lokales Modell wird NICHT geladen — nur das Fallback braucht's,
|
||||||
changed = True
|
# und das passiert erst on-demand wenn Remote nicht antwortet.
|
||||||
|
logger.info("[rvs] Whisper-Modell → %s (nur Config; Modell laedt Gamebox)",
|
||||||
|
new_model)
|
||||||
|
self.stt_engine.model_size = new_model
|
||||||
|
self.stt_engine.model = None
|
||||||
|
changed = True
|
||||||
# Persistent speichern in Shared Volume
|
# Persistent speichern in Shared Volume
|
||||||
if changed:
|
if changed:
|
||||||
try:
|
try:
|
||||||
@@ -1353,22 +1376,111 @@ class ARIABridge:
|
|||||||
mime_type, duration_ms, len(audio_b64) // 1365)
|
mime_type, duration_ms, len(audio_b64) // 1365)
|
||||||
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
||||||
|
|
||||||
|
elif msg_type == "stt_response":
|
||||||
|
# Antwort der whisper-bridge auf unseren stt_request
|
||||||
|
request_id = payload.get("requestId", "")
|
||||||
|
future = self._pending_stt.get(request_id)
|
||||||
|
if future is None or future.done():
|
||||||
|
return
|
||||||
|
error = payload.get("error", "")
|
||||||
|
if error:
|
||||||
|
logger.warning("[rvs] stt_response Fehler: %s", error)
|
||||||
|
future.set_result(None)
|
||||||
|
else:
|
||||||
|
text = payload.get("text", "")
|
||||||
|
stt_ms = payload.get("sttMs", 0)
|
||||||
|
model = payload.get("model", "?")
|
||||||
|
logger.info("[rvs] Remote-STT OK (%s, %dms): '%s'", model, stt_ms, (text or "")[:80])
|
||||||
|
future.set_result(text)
|
||||||
|
return
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
|
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
|
||||||
|
|
||||||
|
# STT-Orchestrierung: zuerst Remote (Gamebox), Fallback lokal.
|
||||||
|
# Timeout grosszuegig gewaehlt, damit auch ein erstmaliger Modell-Load
|
||||||
|
# auf der Gamebox (bis ~30s bei large-v3) durchgeht.
|
||||||
|
_STT_REMOTE_TIMEOUT_S = 45.0
|
||||||
|
|
||||||
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
|
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
|
||||||
"""Decodiert App-Audio (Base64 AAC/MP4), konvertiert zu 16kHz PCM, STT, sendet an core."""
|
"""App-Audio → STT → aria-core. Primaer via whisper-bridge (RVS), Fallback lokal."""
|
||||||
|
# Erst Remote versuchen
|
||||||
|
text = await self._stt_remote(audio_b64, mime_type)
|
||||||
|
if text is None:
|
||||||
|
# Remote hat nicht geantwortet → lokales Whisper
|
||||||
|
logger.warning("[rvs] Remote-STT nicht verfuegbar — Fallback auf lokales Whisper")
|
||||||
|
text = await self._stt_local(audio_b64, mime_type)
|
||||||
|
if text is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
if text.strip():
|
||||||
|
logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
|
||||||
|
# ERST an aria-core senden (wichtigster Schritt)
|
||||||
|
await self.send_to_core(text, source="app-voice")
|
||||||
|
# STT-Text an RVS senden (fuer Anzeige in App + Diagnostic)
|
||||||
|
# sender="stt" damit Bridge es ignoriert (kein Loop)
|
||||||
|
try:
|
||||||
|
await self._send_to_rvs({
|
||||||
|
"type": "chat",
|
||||||
|
"payload": {
|
||||||
|
"text": text,
|
||||||
|
"sender": "stt",
|
||||||
|
},
|
||||||
|
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("[rvs] STT-Text konnte nicht an RVS gesendet werden: %s", e)
|
||||||
|
else:
|
||||||
|
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
|
||||||
|
|
||||||
|
async def _stt_remote(self, audio_b64: str, mime_type: str) -> Optional[str]:
|
||||||
|
"""Schickt Audio an die whisper-bridge und wartet auf stt_response.
|
||||||
|
|
||||||
|
Rueckgabe:
|
||||||
|
str — erkannter Text (kann leer sein)
|
||||||
|
None — Remote-STT nicht erreichbar oder Fehler/Timeout (→ Fallback)
|
||||||
|
"""
|
||||||
|
if self.ws_rvs is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
request_id = str(uuid.uuid4())
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
future: asyncio.Future = loop.create_future()
|
||||||
|
self._pending_stt[request_id] = future
|
||||||
|
|
||||||
|
try:
|
||||||
|
await self._send_to_rvs({
|
||||||
|
"type": "stt_request",
|
||||||
|
"payload": {
|
||||||
|
"requestId": request_id,
|
||||||
|
"audio": audio_b64,
|
||||||
|
"mimeType": mime_type,
|
||||||
|
"model": getattr(self.stt_engine, "model_size", "small"),
|
||||||
|
"language": getattr(self.stt_engine, "language", "de"),
|
||||||
|
},
|
||||||
|
"timestamp": int(loop.time() * 1000),
|
||||||
|
})
|
||||||
|
return await asyncio.wait_for(future, timeout=self._STT_REMOTE_TIMEOUT_S)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("[rvs] Remote-STT Timeout (%.0fs)", self._STT_REMOTE_TIMEOUT_S)
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("[rvs] Remote-STT Fehler: %s", e)
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
self._pending_stt.pop(request_id, None)
|
||||||
|
|
||||||
|
async def _stt_local(self, audio_b64: str, mime_type: str) -> Optional[str]:
|
||||||
|
"""Lokales Whisper-Fallback: FFmpeg → float32 → stt_engine.transcribe."""
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
tmp_in = None
|
tmp_in = None
|
||||||
tmp_out = None
|
tmp_out = None
|
||||||
try:
|
try:
|
||||||
# Base64 → temp-Datei
|
|
||||||
ext = ".mp4" if "mp4" in mime_type else ".wav" if "wav" in mime_type else ".ogg"
|
ext = ".mp4" if "mp4" in mime_type else ".wav" if "wav" in mime_type else ".ogg"
|
||||||
tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
tmp_in = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
|
||||||
tmp_in.write(base64.b64decode(audio_b64))
|
tmp_in.write(base64.b64decode(audio_b64))
|
||||||
tmp_in.close()
|
tmp_in.close()
|
||||||
|
|
||||||
# FFmpeg: beliebiges Format → 16kHz mono PCM (raw float32)
|
|
||||||
tmp_out = tempfile.NamedTemporaryFile(suffix=".raw", delete=False)
|
tmp_out = tempfile.NamedTemporaryFile(suffix=".raw", delete=False)
|
||||||
tmp_out.close()
|
tmp_out.close()
|
||||||
|
|
||||||
@@ -1383,45 +1495,21 @@ class ARIABridge:
|
|||||||
)
|
)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
logger.error("[rvs] FFmpeg Fehler: %s", result.stderr.decode()[:200])
|
logger.error("[rvs] FFmpeg Fehler: %s", result.stderr.decode()[:200])
|
||||||
return
|
return None
|
||||||
|
|
||||||
# PCM lesen → numpy float32
|
|
||||||
audio_data = np.fromfile(tmp_out.name, dtype=np.float32)
|
audio_data = np.fromfile(tmp_out.name, dtype=np.float32)
|
||||||
if len(audio_data) == 0:
|
if len(audio_data) == 0:
|
||||||
logger.warning("[rvs] Leere Audio-Daten nach Konvertierung")
|
logger.warning("[rvs] Leere Audio-Daten nach Konvertierung")
|
||||||
return
|
return None
|
||||||
|
|
||||||
duration_s = len(audio_data) / 16000.0
|
duration_s = len(audio_data) / 16000.0
|
||||||
logger.info("[rvs] Audio konvertiert: %.1fs, %d samples", duration_s, len(audio_data))
|
logger.info("[rvs] Lokal-STT: %.1fs Audio, model=%s", duration_s, self.stt_engine.model_size)
|
||||||
|
return await loop.run_in_executor(None, self.stt_engine.transcribe, audio_data)
|
||||||
# STT
|
|
||||||
text = await loop.run_in_executor(None, self.stt_engine.transcribe, audio_data)
|
|
||||||
|
|
||||||
if text.strip():
|
|
||||||
logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
|
|
||||||
# ERST an aria-core senden (wichtigster Schritt)
|
|
||||||
await self.send_to_core(text, source="app-voice")
|
|
||||||
# STT-Text an RVS senden (fuer Anzeige in App + Diagnostic)
|
|
||||||
# sender="stt" damit Bridge es ignoriert (kein Loop)
|
|
||||||
try:
|
|
||||||
await self._send_to_rvs({
|
|
||||||
"type": "chat",
|
|
||||||
"payload": {
|
|
||||||
"text": text,
|
|
||||||
"sender": "stt",
|
|
||||||
},
|
|
||||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("[rvs] STT-Text konnte nicht an RVS gesendet werden: %s", e)
|
|
||||||
else:
|
|
||||||
logger.info("[rvs] Keine Sprache erkannt — ignoriert")
|
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("[rvs] Audio-Verarbeitung fehlgeschlagen")
|
logger.exception("[rvs] Lokales STT fehlgeschlagen")
|
||||||
|
return None
|
||||||
finally:
|
finally:
|
||||||
# Temp-Dateien aufraeumen
|
for f in (tmp_in, tmp_out):
|
||||||
for f in [tmp_in, tmp_out]:
|
|
||||||
if f:
|
if f:
|
||||||
try:
|
try:
|
||||||
os.unlink(f.name)
|
os.unlink(f.name)
|
||||||
|
|||||||
+26
-1
@@ -438,13 +438,14 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- XTTS Stimme -->
|
<!-- XTTS Stimme -->
|
||||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:12px;">
|
<div style="display:flex;align-items:center;gap:12px;margin-bottom:6px;">
|
||||||
<label style="color:#8888AA;font-size:12px;">XTTS Stimme:</label>
|
<label style="color:#8888AA;font-size:12px;">XTTS Stimme:</label>
|
||||||
<select id="diag-xtts-voice" onchange="sendVoiceConfig()" style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
|
<select id="diag-xtts-voice" onchange="sendVoiceConfig()" style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
|
||||||
<option value="">Standard (XTTS Default)</option>
|
<option value="">Standard (XTTS Default)</option>
|
||||||
</select>
|
</select>
|
||||||
<button class="btn secondary" onclick="loadXTTSVoices()" style="padding:4px 10px;font-size:11px;">Laden</button>
|
<button class="btn secondary" onclick="loadXTTSVoices()" style="padding:4px 10px;font-size:11px;">Laden</button>
|
||||||
</div>
|
</div>
|
||||||
|
<div id="voice-status" style="font-size:11px;min-height:14px;margin-bottom:12px;color:#8888AA;"></div>
|
||||||
|
|
||||||
<!-- Gecloned Stimmen — Liste mit Loeschen -->
|
<!-- Gecloned Stimmen — Liste mit Loeschen -->
|
||||||
<div id="xtts-voice-list" style="margin-bottom:12px;"></div>
|
<div id="xtts-voice-list" style="margin-bottom:12px;"></div>
|
||||||
@@ -851,6 +852,25 @@
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'voice_ready') {
|
||||||
|
const v = msg.payload?.voice || '';
|
||||||
|
const err = msg.payload?.error;
|
||||||
|
const ms = msg.payload?.loadMs;
|
||||||
|
const statusEl = document.getElementById('voice-status');
|
||||||
|
if (statusEl) {
|
||||||
|
if (err) {
|
||||||
|
statusEl.textContent = `⚠️ Stimme "${v}" Fehler: ${err}`;
|
||||||
|
statusEl.style.color = '#FF3B30';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = `✅ Stimme "${v || 'Standard'}" bereit${ms ? ` (${(ms/1000).toFixed(1)}s)` : ''}`;
|
||||||
|
statusEl.style.color = '#34C759';
|
||||||
|
}
|
||||||
|
setTimeout(() => { if (statusEl) statusEl.textContent = ''; }, 5000);
|
||||||
|
}
|
||||||
|
addLog('info', 'xtts', err ? `Voice "${v}": ${err}` : `Voice "${v || 'Standard'}" bereit`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (msg.type === 'watchdog') {
|
if (msg.type === 'watchdog') {
|
||||||
const colors = { warning: '#FFD60A', fixing: '#FF9500', fixed: '#34C759', error: '#FF3B30' };
|
const colors = { warning: '#FFD60A', fixing: '#FF9500', fixed: '#34C759', error: '#FF3B30' };
|
||||||
const color = colors[msg.status] || '#FFD60A';
|
const color = colors[msg.status] || '#FFD60A';
|
||||||
@@ -1551,6 +1571,11 @@
|
|||||||
const xttsVoice = document.getElementById('diag-xtts-voice').value;
|
const xttsVoice = document.getElementById('diag-xtts-voice').value;
|
||||||
const whisperModel = document.getElementById('diag-whisper-model').value;
|
const whisperModel = document.getElementById('diag-whisper-model').value;
|
||||||
send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel });
|
send({ action: 'send_voice_config', ttsEnabled, xttsVoice, whisperModel });
|
||||||
|
const statusEl = document.getElementById('voice-status');
|
||||||
|
if (statusEl && xttsVoice) {
|
||||||
|
statusEl.textContent = `⏳ Stimme "${xttsVoice}" wird geladen...`;
|
||||||
|
statusEl.style.color = '#FFD60A';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Passwort-Feld Anzeigen/Verbergen ─────────────────────
|
// ── Passwort-Feld Anzeigen/Verbergen ─────────────────────
|
||||||
|
|||||||
@@ -626,6 +626,17 @@ function connectRVS(forcePlain) {
|
|||||||
// Mode-Broadcast von der Bridge → an Browser-Clients weiterreichen
|
// Mode-Broadcast von der Bridge → an Browser-Clients weiterreichen
|
||||||
log("info", "rvs", `Mode-Broadcast: ${msg.payload?.mode} (${msg.payload?.name})`);
|
log("info", "rvs", `Mode-Broadcast: ${msg.payload?.mode} (${msg.payload?.name})`);
|
||||||
broadcast({ type: "mode", payload: msg.payload });
|
broadcast({ type: "mode", payload: msg.payload });
|
||||||
|
} else if (msg.type === "voice_ready") {
|
||||||
|
// XTTS-Bridge meldet Stimme fertig geladen → an Browser durchreichen
|
||||||
|
const v = msg.payload?.voice || "";
|
||||||
|
const err = msg.payload?.error;
|
||||||
|
const ms = msg.payload?.loadMs;
|
||||||
|
if (err) {
|
||||||
|
log("warn", "rvs", `Voice-Ready Fehler fuer "${v}": ${err}`);
|
||||||
|
} else {
|
||||||
|
log("info", "rvs", `Voice "${v || "default"}" geladen${ms ? ` in ${(ms/1000).toFixed(1)}s` : ""}`);
|
||||||
|
}
|
||||||
|
broadcast({ type: "voice_ready", payload: msg.payload });
|
||||||
} else {
|
} else {
|
||||||
log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`);
|
log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ const ALLOWED_TYPES = new Set([
|
|||||||
"agent_activity", "cancel_request",
|
"agent_activity", "cancel_request",
|
||||||
"audio_pcm",
|
"audio_pcm",
|
||||||
"xtts_delete_voice",
|
"xtts_delete_voice",
|
||||||
|
"voice_preload", "voice_ready",
|
||||||
|
"stt_request", "stt_response",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
// Token-Raum: token -> { clients: Set<ws> }
|
// Token-Raum: token -> { clients: Set<ws> }
|
||||||
|
|||||||
+87
-4
@@ -69,6 +69,18 @@ function connectRVS(forcePlain) {
|
|||||||
await handleListVoices();
|
await handleListVoices();
|
||||||
} else if (msg.type === "xtts_delete_voice") {
|
} else if (msg.type === "xtts_delete_voice") {
|
||||||
await handleDeleteVoice(msg.payload);
|
await handleDeleteVoice(msg.payload);
|
||||||
|
} else if (msg.type === "voice_preload") {
|
||||||
|
await handleVoicePreload(msg.payload);
|
||||||
|
} else if (msg.type === "config") {
|
||||||
|
// Diagnostic hat globale Voice gewechselt → Preload damit der naechste
|
||||||
|
// Render ohne Ladewartezeit startet + alle Clients "voice_ready" sehen
|
||||||
|
const v = msg.payload && msg.payload.xttsVoice;
|
||||||
|
if (v && v !== lastDiagnosticVoice) {
|
||||||
|
lastDiagnosticVoice = v;
|
||||||
|
await handleVoicePreload({ voice: v, source: "diagnostic" });
|
||||||
|
} else if (!v) {
|
||||||
|
lastDiagnosticVoice = "";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
log(`Fehler: ${err.message}`);
|
log(`Fehler: ${err.message}`);
|
||||||
@@ -120,6 +132,10 @@ function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) {
|
|||||||
// interleaved PCM-Chunks aus zwei Rendern → klingt wie Chaos.
|
// interleaved PCM-Chunks aus zwei Rendern → klingt wie Chaos.
|
||||||
let ttsQueue = Promise.resolve();
|
let ttsQueue = Promise.resolve();
|
||||||
|
|
||||||
|
// Merkt sich die letzte in Diagnostic gewaehlte Voice, damit wir nicht bei jedem
|
||||||
|
// config-Broadcast (auch ohne Aenderung) einen Preload triggern.
|
||||||
|
let lastDiagnosticVoice = "";
|
||||||
|
|
||||||
function handleTTSRequest(payload) {
|
function handleTTSRequest(payload) {
|
||||||
ttsQueue = ttsQueue.then(() => _runTTSRequest(payload)).catch(err => {
|
ttsQueue = ttsQueue.then(() => _runTTSRequest(payload)).catch(err => {
|
||||||
log(`TTS-Queue Fehler: ${err.message}`);
|
log(`TTS-Queue Fehler: ${err.message}`);
|
||||||
@@ -151,8 +167,18 @@ async function _runTTSRequest(payload) {
|
|||||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
// Im local-Mode erwartet daswer123 XTTS speaker_wav als Basename (ohne .wav,
|
||||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
// ohne Pfad) — der Server prefixt EXAMPLE_FOLDER selbst. Wir checken hier
|
||||||
|
// nur das physische File ab um Warnungen zu loggen; runter ans API geht
|
||||||
|
// nur der Name.
|
||||||
|
const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||||
|
const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath);
|
||||||
|
const speakerName = hasCustomVoice ? voice : "";
|
||||||
|
if (voice && !hasCustomVoice) {
|
||||||
|
log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`);
|
||||||
|
} else if (hasCustomVoice) {
|
||||||
|
log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`);
|
||||||
|
}
|
||||||
|
|
||||||
let chunkIndex = 0;
|
let chunkIndex = 0;
|
||||||
let pcmMeta = null;
|
let pcmMeta = null;
|
||||||
@@ -190,7 +216,7 @@ async function _runTTSRequest(payload) {
|
|||||||
await streamXTTSAsPCM(
|
await streamXTTSAsPCM(
|
||||||
cleanText,
|
cleanText,
|
||||||
language || "de",
|
language || "de",
|
||||||
hasCustomVoice ? voiceSample : null,
|
speakerName,
|
||||||
onChunk,
|
onChunk,
|
||||||
);
|
);
|
||||||
} catch (streamErr) {
|
} catch (streamErr) {
|
||||||
@@ -198,7 +224,7 @@ async function _runTTSRequest(payload) {
|
|||||||
await streamXTTSBatch(
|
await streamXTTSBatch(
|
||||||
cleanText,
|
cleanText,
|
||||||
language || "de",
|
language || "de",
|
||||||
hasCustomVoice ? voiceSample : null,
|
speakerName,
|
||||||
onChunk,
|
onChunk,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -460,6 +486,63 @@ async function handleDeleteVoice(payload) {
|
|||||||
|
|
||||||
// ── Voice List Handler ──────────────────────────────
|
// ── Voice List Handler ──────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Preloads a voice by rendering a short throw-away utterance through the
 * shared TTS queue and then reports the outcome as a "voice_ready" message.
 *
 * The render is chained onto `ttsQueue`, so it cannot overlap with a real
 * TTS request. Every exit path sends exactly one "voice_ready" message:
 * with `loadMs` on success, with `error` otherwise.
 *
 * @param {{voice?: string, source?: string, requestId?: string}} payload
 *   `voice` is the voice name without extension (empty means default voice),
 *   `source` is used for logging only, `requestId` is echoed back unchanged.
 * @returns {Promise<void>} Resolves after the "voice_ready" message was sent;
 *   render failures are reported in that message, not thrown.
 */
async function handleVoicePreload(payload) {
  const voice = (payload && payload.voice) || "";
  const source = (payload && payload.source) || "unknown";
  const requestId = (payload && payload.requestId) || "";
  log(`Voice-Preload angefordert: "${voice}" (source=${source})`);

  // Single place that builds the answer, so all paths share the same shape.
  const reply = (extra) =>
    sendToRVS({
      type: "voice_ready",
      payload: { voice, requestId, ...extra },
      timestamp: Date.now(),
    });

  try {
    let speakerName = "";
    if (voice) {
      // `voice` arrives over the network. Only a bare file name is accepted,
      // so a value like "../../etc/x" can neither probe the filesystem nor be
      // forwarded to the XTTS server as speaker name.
      if (typeof voice !== "string" || path.basename(voice) !== voice) {
        reply({ error: "invalid-voice-name" });
        log("Preload abgebrochen: ungueltiger Voice-Name");
        return;
      }
      const voiceFilePath = path.join(VOICES_DIR, `${voice}.wav`);
      if (!fs.existsSync(voiceFilePath)) {
        reply({ error: "voice-file-not-found" });
        log(`Preload abgebrochen: ${voiceFilePath} existiert nicht`);
        return;
      }
      speakerName = voice;
    }

    // The measured time includes waiting for renders already in the queue.
    const t0 = Date.now();
    const render = ttsQueue.then(() =>
      streamXTTSAsPCM("ja.", "de", speakerName, () => {}),
    );
    // Keep the shared queue usable even if this render fails; the failure
    // itself is observed through `render` below.
    ttsQueue = render.catch(() => {});
    await render;

    const ms = Date.now() - t0;
    log(`Voice "${voice || "default"}" geladen in ${ms}ms`);
    reply({ loadMs: ms });
  } catch (err) {
    // A rejection is not guaranteed to be an Error instance; never let the
    // error report itself throw, otherwise no "voice_ready" is sent at all.
    const detail = err instanceof Error ? err.message : String(err);
    log(`Voice-Preload Fehler: ${detail}`);
    reply({ error: detail.slice(0, 200) });
  }
}
|
||||||
|
|
||||||
async function handleListVoices() {
|
async function handleListVoices() {
|
||||||
try {
|
try {
|
||||||
const files = fs.existsSync(VOICES_DIR)
|
const files = fs.existsSync(VOICES_DIR)
|
||||||
|
|||||||
@@ -58,5 +58,37 @@ services:
|
|||||||
- RVS_TOKEN=${RVS_TOKEN}
|
- RVS_TOKEN=${RVS_TOKEN}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# ─── Whisper STT (GPU) ────────────────────────
|
||||||
|
# Faster-Whisper auf der Gamebox statt auf der VM (CPU) —
|
||||||
|
# deutlich schneller. Verbindet sich selbst per WebSocket an
|
||||||
|
# den RVS und nimmt dort stt_request Nachrichten der aria-bridge
|
||||||
|
# entgegen, antwortet mit stt_response. Laedt das Modell beim
|
||||||
|
# Start vor; auf Config-Broadcasts (Diagnostic → whisperModel)
|
||||||
|
# wird zur Laufzeit hot-swapped.
|
||||||
|
whisper-bridge:
|
||||||
|
build: ./whisper
|
||||||
|
container_name: aria-whisper-bridge
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: 1
|
||||||
|
capabilities: [gpu]
|
||||||
|
environment:
|
||||||
|
- RVS_HOST=${RVS_HOST}
|
||||||
|
- RVS_PORT=${RVS_PORT:-443}
|
||||||
|
- RVS_TLS=${RVS_TLS:-true}
|
||||||
|
- RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
|
||||||
|
- RVS_TOKEN=${RVS_TOKEN}
|
||||||
|
- WHISPER_MODEL=${WHISPER_MODEL:-small}
|
||||||
|
- WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
|
||||||
|
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
||||||
|
- WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
|
||||||
|
volumes:
|
||||||
|
- whisper-models:/root/.cache/huggingface # Model-Cache persistieren
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
xtts-models:
|
xtts-models:
|
||||||
|
whisper-models:
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3 python3-pip ffmpeg \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip3 install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
COPY bridge.py .
|
||||||
|
|
||||||
|
CMD ["python3", "bridge.py"]
|
||||||
@@ -0,0 +1,247 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
ARIA Whisper Bridge — laeuft auf der Gamebox (RTX 3060).
|
||||||
|
|
||||||
|
Empfaengt stt_request via RVS → FFmpeg-Konvertierung → faster-whisper auf GPU
|
||||||
|
→ sendet stt_response zurueck an die aria-bridge.
|
||||||
|
|
||||||
|
Env:
|
||||||
|
RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
|
||||||
|
WHISPER_MODEL Default: small
|
||||||
|
WHISPER_DEVICE Default: cuda
|
||||||
|
WHISPER_COMPUTE_TYPE Default: float16
|
||||||
|
WHISPER_LANGUAGE Default: de
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import websockets
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
|
datefmt="%H:%M:%S",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("whisper-bridge")
|
||||||
|
|
||||||
|
RVS_HOST = os.getenv("RVS_HOST", "").strip()
|
||||||
|
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
|
||||||
|
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
|
||||||
|
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
|
||||||
|
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
|
||||||
|
|
||||||
|
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
|
||||||
|
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
|
||||||
|
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
|
||||||
|
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "de")
|
||||||
|
|
||||||
|
ALLOWED_MODELS = {"tiny", "base", "small", "medium", "large-v3"}
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperRunner:
    """Holds the Whisper model and hot-swaps it via ensure_loaded().

    Model construction and transcription are blocking calls, so both are
    pushed to the default executor to keep the event loop responsive.
    """

    def __init__(self) -> None:
        # Size of the loaded model; before the first load, the configured default.
        self.model_size: str = WHISPER_MODEL
        self.model: Optional[WhisperModel] = None
        # Serialises loads so concurrent callers never load the same model twice.
        self._lock = asyncio.Lock()

    def _load_blocking(self, size: str) -> None:
        """Construct the model for ``size`` (blocking) and make it the active one."""
        logger.info(
            "Lade Whisper '%s' (device=%s, compute=%s)",
            size, WHISPER_DEVICE, WHISPER_COMPUTE_TYPE,
        )
        t0 = time.time()
        # Assigned only after construction succeeded: if WhisperModel raises,
        # the previously loaded model (if any) stays active.
        self.model = WhisperModel(
            size, device=WHISPER_DEVICE, compute_type=WHISPER_COMPUTE_TYPE,
        )
        self.model_size = size
        logger.info("Whisper '%s' geladen in %.1fs", size, time.time() - t0)

    async def ensure_loaded(self, desired_size: str) -> None:
        """Make sure the model ``desired_size`` is loaded, swapping if necessary.

        Values that are not a known model name fall back to WHISPER_MODEL.
        Exceptions raised while constructing the model propagate to the caller.
        """
        # The value can originate from a network payload; a non-string (e.g. a
        # JSON list) would make the set lookup raise TypeError (unhashable).
        if not isinstance(desired_size, str) or desired_size not in ALLOWED_MODELS:
            logger.warning("Ungueltiges Whisper-Modell '%s' — nutze %s", desired_size, WHISPER_MODEL)
            desired_size = WHISPER_MODEL
        async with self._lock:
            if self.model is not None and self.model_size == desired_size:
                return
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self._load_blocking, desired_size)

    async def transcribe(self, audio: np.ndarray, language: str) -> tuple[str, float]:
        """Transcribe ``audio`` and return ``(text, duration)``.

        ``duration`` is the ``duration`` attribute of the info object returned
        by the model. Returns ``("", 0.0)`` when no model is loaded yet.
        """
        # Snapshot the model so the whole call uses one instance even if a
        # hot-swap replaces self.model while the worker thread is running.
        model = self.model
        if model is None:
            return "", 0.0

        def _run() -> tuple[str, float]:
            segments, info = model.transcribe(
                audio, language=language, beam_size=5, vad_filter=True,
            )
            # Joining consumes the segments inside the worker thread.
            text = " ".join(seg.text.strip() for seg in segments)
            return text, info.duration

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run)
|
||||||
|
|
||||||
|
|
||||||
|
def ffmpeg_to_float32(audio_b64: str, mime_type: str) -> np.ndarray:
    """Decode base64-encoded audio into 16 kHz mono float32 PCM using ffmpeg.

    Args:
        audio_b64: Base64-encoded audio file content.
        mime_type: MIME type of the audio; only used to choose the suffix of
            the temporary input file (".mp4", ".wav", ".ogg", else ".bin").

    Returns:
        A 1-D float32 array with the decoded samples, or an empty float32
        array when ffmpeg exits with a non-zero status.

    Raises:
        ValueError: ``audio_b64`` is not valid base64 (binascii.Error).
        subprocess.TimeoutExpired: ffmpeg did not finish within 30 seconds.
    """
    mime = (mime_type or "").lower()
    if "mp4" in mime or "m4a" in mime or "aac" in mime:
        ext = ".mp4"
    elif "wav" in mime:
        ext = ".wav"
    elif "ogg" in mime or "opus" in mime:
        ext = ".ogg"
    else:
        ext = ".bin"

    # Decode first so invalid input never leaves files behind.
    raw = base64.b64decode(audio_b64)

    # Both scratch files live in a private directory that is removed on every
    # exit path: success, ffmpeg failure, timeout or any other exception.
    with tempfile.TemporaryDirectory(prefix="whisper-stt-") as workdir:
        in_path = os.path.join(workdir, "input" + ext)
        out_path = os.path.join(workdir, "output.raw")
        with open(in_path, "wb") as in_fh:
            in_fh.write(raw)

        cmd = ["ffmpeg", "-y", "-i", in_path, "-ar", "16000", "-ac", "1", "-f", "f32le", out_path]
        result = subprocess.run(cmd, capture_output=True, timeout=30)
        if result.returncode != 0:
            logger.error("FFmpeg Fehler: %s", result.stderr.decode(errors="replace")[:300])
            return np.zeros(0, dtype=np.float32)
        # fromfile copies the samples into memory, so the file can go away.
        return np.fromfile(out_path, dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
async def _send(ws, mtype: str, payload: dict) -> None:
    """Send one JSON envelope (type, payload, millisecond timestamp) over ``ws``.

    Best effort: any exception while serialising or sending is logged as a
    warning and not propagated.
    """
    envelope = {
        "type": mtype,
        "payload": payload,
        "timestamp": int(time.time() * 1000),
    }
    try:
        frame = json.dumps(envelope)
        await ws.send(frame)
    except Exception as exc:
        logger.warning("Send fehlgeschlagen (%s): %s", mtype, exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_stt_request(ws, payload: dict, runner: WhisperRunner) -> None:
    """Handle one ``stt_request``: decode the audio, transcribe it, reply.

    Exactly one ``stt_response`` is sent per request. It carries either the
    result (``text``, ``durationS``, ``sttMs``, ``loadMs``, ``model``) or an
    ``error`` string ("no-audio", "ffmpeg-failed", or the first 200 characters
    of the exception text).

    Args:
        ws: Connection the response is sent on.
        payload: Request payload; reads ``requestId``, ``audio`` (base64),
            ``mimeType``, ``model`` and ``language``.
        runner: Model holder used for loading and transcription.
    """
    request_id = payload.get("requestId", "")
    audio_b64 = payload.get("audio", "")
    mime_type = payload.get("mimeType") or "audio/mp4"
    # Without an explicit model keep whatever is currently active, so a model
    # chosen via config broadcast is not reverted by the next request.
    # runner.model_size starts out as WHISPER_MODEL.
    model = payload.get("model") or runner.model_size
    language = payload.get("language") or WHISPER_LANGUAGE

    if not audio_b64:
        await _send(ws, "stt_response", {"requestId": request_id, "error": "no-audio"})
        return

    try:
        t_load = time.time()
        await runner.ensure_loaded(model)
        load_ms = int((time.time() - t_load) * 1000)

        # ffmpeg is a blocking subprocess call (up to 30 s); run it in the
        # executor so the event loop keeps servicing the websocket meanwhile.
        loop = asyncio.get_running_loop()
        audio = await loop.run_in_executor(None, ffmpeg_to_float32, audio_b64, mime_type)
        if audio.size == 0:
            await _send(ws, "stt_response", {"requestId": request_id, "error": "ffmpeg-failed"})
            return
        # Samples are 16 kHz mono, hence sample count / 16000 = seconds.
        duration_s = len(audio) / 16000.0
        logger.info("STT-Request: %.1fs Audio, model=%s, lang=%s", duration_s, runner.model_size, language)

        t_stt = time.time()
        text, _ = await runner.transcribe(audio, language)
        stt_ms = int((time.time() - t_stt) * 1000)

        logger.info("STT-Ergebnis (%dms): '%s'", stt_ms, text[:100])

        await _send(ws, "stt_response", {
            "requestId": request_id,
            "text": text.strip(),
            "durationS": duration_s,
            "sttMs": stt_ms,
            "loadMs": load_ms,
            "model": runner.model_size,
        })
    except Exception as e:
        logger.exception("STT-Request fehlgeschlagen")
        await _send(ws, "stt_response", {
            "requestId": request_id,
            "error": str(e)[:200],
        })
|
||||||
|
|
||||||
|
|
||||||
|
async def run_loop(runner: WhisperRunner) -> None:
    """Connect to the RVS websocket and dispatch incoming messages forever.

    Preloads the default model, then loops: connect, handle ``stt_request``
    and ``config`` messages, and reconnect with exponential backoff (2 s up to
    30 s) after any disconnect. Each reconnect cycle first uses the configured
    transport; if a TLS connection cannot be established and
    RVS_TLS_FALLBACK is set, plain ``ws://`` is tried once in that cycle.
    """
    # Preload the model so the first request is fast.
    try:
        await runner.ensure_loaded(WHISPER_MODEL)
    except Exception as e:
        logger.error("Preload fehlgeschlagen: %s — Fortsetzung, wird bei erstem Request nachgeladen", e)

    use_tls = RVS_TLS
    retry_s = 2
    tls_fallback_tried = False

    # The event loop only keeps weak references to tasks; hold strong ones so
    # a running request cannot be garbage-collected mid-execution.
    background: set = set()

    def _spawn(coro, what: str) -> None:
        task = asyncio.create_task(coro)
        background.add(task)

        def _done(finished) -> None:
            background.discard(finished)
            # Surface failures instead of "Task exception was never retrieved".
            if not finished.cancelled() and finished.exception() is not None:
                logger.error("%s fehlgeschlagen: %s", what, finished.exception())

        task.add_done_callback(_done)

    while True:
        scheme = "wss" if use_tls else "ws"
        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
        # Never write the token to the log.
        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
        connected = False
        try:
            logger.info("Verbinde zu RVS: %s", masked)
            async with websockets.connect(url, ping_interval=20, ping_timeout=10) as ws:
                logger.info("RVS verbunden")
                connected = True
                retry_s = 2
                tls_fallback_tried = False
                async for raw in ws:
                    try:
                        msg = json.loads(raw)
                    except Exception:
                        continue
                    # Valid JSON that is not an object has no "type"; skip it
                    # rather than letting it tear down the connection.
                    if not isinstance(msg, dict):
                        continue
                    mtype = msg.get("type", "")
                    payload = msg.get("payload")
                    if not isinstance(payload, dict):
                        payload = {}

                    if mtype == "stt_request":
                        _spawn(handle_stt_request(ws, payload, runner), "STT-Request")
                    elif mtype == "config":
                        new_model = payload.get("whisperModel")
                        if new_model and new_model != runner.model_size:
                            logger.info("Config-Broadcast: Whisper-Modell → %s", new_model)
                            _spawn(runner.ensure_loaded(new_model), "Whisper-Modellwechsel")
                    # Other types (chat, heartbeat, ...) are simply ignored.
            logger.info("RVS-Verbindung geschlossen")
        except Exception as e:
            logger.warning("Verbindung verloren: %s", e)
            # Fall back to plaintext only when the TLS connection could not be
            # established at all, not when a working TLS session dropped.
            if not connected and use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
                logger.info("TLS-Verbindung fehlgeschlagen — Fallback auf ws://")
                use_tls = False
                tls_fallback_tried = True
                continue

        # Back off after every disconnect (including a clean close by the
        # server) and start the next cycle with the configured transport, so a
        # one-off fallback does not pin the bridge to ws:// forever.
        use_tls = RVS_TLS
        tls_fallback_tried = False
        await asyncio.sleep(min(retry_s, 30))
        retry_s = min(retry_s * 2, 30)
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Entry coroutine: require RVS_HOST, then run the bridge loop.

    Logs an error and exits with status 1 when RVS_HOST is empty.
    """
    if RVS_HOST:
        await run_loop(WhisperRunner())
        return
    logger.error("RVS_HOST ist nicht gesetzt — Abbruch")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
try:
|
||||||
|
asyncio.run(main())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
sys.exit(0)
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
faster-whisper==1.0.3
|
||||||
|
websockets>=12.0
|
||||||
|
numpy>=1.24
|
||||||
Reference in New Issue
Block a user