added audio workword, and recording, edited readme

This commit is contained in:
2026-03-29 11:29:15 +02:00
parent b687f790ba
commit dbd97d3cf4
15 changed files with 912 additions and 798 deletions
+69 -3
View File
@@ -30,6 +30,7 @@ import wave
from pathlib import Path
from typing import Optional
import subprocess
import urllib.request
import numpy as np
import sounddevice as sd
@@ -959,13 +960,78 @@ class ARIABridge:
await self.ws_core.send(raw_message)
elif msg_type == "audio":
# Audio von der App → STT → an aria-core
logger.info("[rvs] Audio empfangen — TODO: STT")
# Spaeter: Audio decodieren, durch Whisper jagen, Ergebnis an core
# Audio von der App → decodieren → STT → an aria-core
audio_b64 = payload.get("base64", "")
mime_type = payload.get("mimeType", "audio/mp4")
duration_ms = payload.get("durationMs", 0)
if not audio_b64:
logger.warning("[rvs] Audio ohne Daten empfangen")
return
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
mime_type, duration_ms, len(audio_b64) // 1365)
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
else:
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
    """Transcribe a Base64-encoded audio clip from the app and forward the text.

    The clip is decoded into a temporary file, converted by FFmpeg to
    16 kHz mono raw float32 PCM, handed to ``self.stt_engine.transcribe``
    and, if non-blank text comes back, passed on through
    ``self.send_to_core`` with ``source="app-voice"``.

    Any ``Exception`` raised along the way is logged and not propagated.
    Both temporary files are removed in every case.

    Args:
        audio_b64: Base64-encoded audio payload.
        mime_type: MIME type reported by the sender. It is only used to
            pick the temp-file suffix (".mp4", ".wav", otherwise ".ogg").
    """
    # We are inside a coroutine, so the running loop is the one to use.
    loop = asyncio.get_running_loop()
    in_path = None
    out_path = None
    try:
        # A missing/None MIME type falls through to the ".ogg" default.
        mime_type = mime_type or ""
        if "mp4" in mime_type:
            ext = ".mp4"
        elif "wav" in mime_type:
            ext = ".wav"
        else:
            ext = ".ogg"

        # Base64 -> temp file. The context manager closes the handle even
        # when the write fails, so the cleanup below can always unlink it.
        raw_bytes = base64.b64decode(audio_b64)
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as src:
            in_path = src.name
            src.write(raw_bytes)

        # Reserve a path for FFmpeg's output; FFmpeg overwrites it ("-y").
        with tempfile.NamedTemporaryFile(suffix=".raw", delete=False) as dst:
            out_path = dst.name

        # FFmpeg: arbitrary input format -> 16 kHz mono PCM (raw float32).
        # "-hide_banner -loglevel error" keeps stderr limited to real
        # errors, so the truncated log line below is meaningful;
        # "-nostdin" stops FFmpeg from reading the process's stdin.
        cmd = [
            "ffmpeg", "-hide_banner", "-loglevel", "error", "-nostdin",
            "-y", "-i", in_path,
            "-ar", "16000", "-ac", "1", "-f", "f32le",
            out_path,
        ]
        # subprocess.run blocks, so it runs in the default executor.
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(cmd, capture_output=True, timeout=30),
        )
        if result.returncode != 0:
            # errors="replace": undecodable bytes must not mask the error.
            logger.error(
                "[rvs] FFmpeg Fehler: %s",
                result.stderr.decode(errors="replace")[:200],
            )
            return

        # Read PCM -> numpy float32.
        audio_data = np.fromfile(out_path, dtype=np.float32)
        if audio_data.size == 0:
            logger.warning("[rvs] Leere Audio-Daten nach Konvertierung")
            return

        duration_s = audio_data.size / 16000.0
        logger.info(
            "[rvs] Audio konvertiert: %.1fs, %d samples",
            duration_s, audio_data.size,
        )

        # Speech-to-text, also off the event loop.
        text = await loop.run_in_executor(
            None, self.stt_engine.transcribe, audio_data
        )
        # Treat a None result the same as an empty transcription.
        if text and text.strip():
            logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
            await self.send_to_core(text, source="app-voice")
        else:
            logger.info("[rvs] Keine Sprache erkannt — ignoriert")
    except Exception:
        logger.exception("[rvs] Audio-Verarbeitung fehlgeschlagen")
    finally:
        # Clean up the temp files; a failed unlink is deliberately ignored.
        for path in (in_path, out_path):
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass
async def _send_to_rvs(self, message: dict) -> None:
"""Sendet eine Nachricht an die App (via RVS)."""
if self.ws_rvs is None or not self.ws_rvs.open: