added audio workword, and recording, edited readme
This commit is contained in:
+69
-3
@@ -30,6 +30,7 @@ import wave
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import subprocess
|
||||
import urllib.request
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
@@ -959,13 +960,78 @@ class ARIABridge:
|
||||
await self.ws_core.send(raw_message)
|
||||
|
||||
elif msg_type == "audio":
|
||||
# Audio von der App → STT → an aria-core
|
||||
logger.info("[rvs] Audio empfangen — TODO: STT")
|
||||
# Spaeter: Audio decodieren, durch Whisper jagen, Ergebnis an core
|
||||
# Audio von der App → decodieren → STT → an aria-core
|
||||
audio_b64 = payload.get("base64", "")
|
||||
mime_type = payload.get("mimeType", "audio/mp4")
|
||||
duration_ms = payload.get("durationMs", 0)
|
||||
if not audio_b64:
|
||||
logger.warning("[rvs] Audio ohne Daten empfangen")
|
||||
return
|
||||
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
|
||||
mime_type, duration_ms, len(audio_b64) // 1365)
|
||||
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
||||
|
||||
else:
|
||||
logger.debug("[rvs] Unbekannter Typ: %s", msg_type)
|
||||
|
||||
async def _process_app_audio(self, audio_b64: str, mime_type: str) -> None:
    """Transcribe a Base64-encoded audio clip from the app and forward the text.

    Pipeline: Base64 -> temp file -> FFmpeg (16 kHz, mono, raw float32) ->
    numpy array -> ``self.stt_engine.transcribe`` -> ``self.send_to_core``.

    The method never raises: every failure is logged and the clip is dropped,
    because it is started as a fire-and-forget task. Both temp files are
    removed in all cases.

    Args:
        audio_b64: Base64-encoded audio payload as received from the app.
        mime_type: MIME type reported by the app; only used to choose a file
            extension for the temporary input file.
    """
    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    temp_paths = []
    try:
        # Pick a file suffix from the MIME type (FFmpeg still probes content).
        mime = mime_type or ""
        if "mp4" in mime:
            ext = ".mp4"
        elif "wav" in mime:
            ext = ".wav"
        else:
            ext = ".ogg"

        # Base64 -> temp input file. The context manager closes the handle
        # even if decoding or writing fails; the path is registered first so
        # the finally-block can still delete the file.
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_in:
            temp_paths.append(tmp_in.name)
            tmp_in.write(base64.b64decode(audio_b64))
        in_path = tmp_in.name

        # Reserve an output path for FFmpeg ("-y" lets it overwrite the stub).
        with tempfile.NamedTemporaryFile(suffix=".raw", delete=False) as tmp_out:
            temp_paths.append(tmp_out.name)
        out_path = tmp_out.name

        # FFmpeg: any input format -> 16 kHz mono PCM (raw float32, little-endian).
        # "-hide_banner -loglevel error" keeps stderr limited to real errors.
        cmd = [
            "ffmpeg", "-hide_banner", "-loglevel", "error",
            "-y", "-i", in_path,
            "-ar", "16000", "-ac", "1", "-f", "f32le",
            out_path,
        ]
        # Run the blocking subprocess in the default executor so the event
        # loop stays responsive. stdin is detached so FFmpeg cannot consume
        # or block on the parent's stdin.
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                capture_output=True,
                timeout=30,
            ),
        )
        if result.returncode != 0:
            # The relevant message is at the END of stderr; decode leniently
            # so odd bytes in file names/metadata cannot raise here.
            stderr_text = result.stderr.decode(errors="replace").strip()
            logger.error("[rvs] FFmpeg Fehler: %s", stderr_text[-200:])
            return

        # Raw PCM -> numpy float32 samples.
        audio_data = np.fromfile(out_path, dtype=np.float32)
        if len(audio_data) == 0:
            logger.warning("[rvs] Leere Audio-Daten nach Konvertierung")
            return

        duration_s = len(audio_data) / 16000.0
        logger.info("[rvs] Audio konvertiert: %.1fs, %d samples", duration_s, len(audio_data))

        # Speech-to-text (blocking call, hence the executor).
        # NOTE(review): transcribe() presumably returns a str — confirm; a
        # None/empty result is treated as "no speech" below.
        text = await loop.run_in_executor(None, self.stt_engine.transcribe, audio_data)

        if text and text.strip():
            logger.info("[rvs] STT Ergebnis: '%s'", text[:80])
            await self.send_to_core(text, source="app-voice")
        else:
            logger.info("[rvs] Keine Sprache erkannt — ignoriert")

    except Exception:
        # Task boundary: log with traceback instead of losing the error in
        # an un-awaited task.
        logger.exception("[rvs] Audio-Verarbeitung fehlgeschlagen")
    finally:
        # Best-effort removal of the temp files.
        for path in temp_paths:
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
            except OSError as exc:
                logger.debug("[rvs] Temp-Datei %s nicht geloescht: %s", path, exc)
|
||||
|
||||
async def _send_to_rvs(self, message: dict) -> None:
|
||||
"""Sendet eine Nachricht an die App (via RVS)."""
|
||||
if self.ws_rvs is None or not self.ws_rvs.open:
|
||||
|
||||
Reference in New Issue
Block a user