feat: Piper komplett entfernt — nur noch XTTS v2 als TTS

Breaking Change: wenn XTTS-Bridge (Gaming-PC) offline ist, bleibt ARIA
stumm. Chat-Antworten kommen weiter an, aber kein Audio. Das ist
bewusst akzeptiert — XTTS klingt einfach um Welten besser.

Bridge (aria_bridge.py):
- from piper import ... raus
- VoiceEngine-Klasse komplett entfernt (synthesize, speak, select_voice)
- EPIC_TRIGGERS + load_epic_triggers raus (Highlight-Voice-Feature
  ohne Piper sinnlos)
- self.voice_engine, voice_name, requested_voice Aufrufe weg
- _process_core_response: immer XTTS, kein Fallback
- tts_request Handler: immer XTTS
- config Handler: nur ttsEnabled + xttsVoice + whisperModel
- import wave raus

bridge/requirements.txt: piper-tts raus
bridge/Dockerfile: Kommentar aktualisiert
docker-compose.yml: ./aria-data/voices Mount raus
aria-data/config/aria.env.example: PIPER_RAMONA/PIPER_THORSTEN raus
get-voices.sh: komplett geloescht (war nur Piper-Downloader)

Diagnostic UI (index.html):
- Piper Panel (Standard-Stimme / Highlight-Stimme / Speed-Sliders) weg
- TTS Engine Dropdown weg (immer XTTS)
- TTS Diagnose Tab zeigt nur noch XTTS-Status + Test-Button
- sendVoiceConfig sendet nur noch ttsEnabled/xttsVoice/whisperModel
- toggleXTTSPanel als no-op Legacy-Stub (JS-Calls bleiben safe)

Diagnostic Server (server.js):
- handleSendVoiceConfig: nur noch ttsEnabled + xttsVoice + whisperModel
- handleTestTTS: via xtts_request (nicht mehr Piper subprocess)
- handleCheckTTS: via xtts_list_voices ueber RVS
- handleGetVoiceConfig/Defaults bereinigt
- Highlight-Trigger UI bleibt, wird aber von Bridge nicht mehr
  ausgewertet (dead-code im UI, spaeter ggf. fuer XTTS-Voice-Switch)

README + issue.md aktualisiert.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 22:24:42 +02:00
parent 6ab6196739
commit f801d99748
10 changed files with 144 additions and 643 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
# ════════════════════════════════════════════════
# ARIA Voice Bridge — Dockerfile
# Whisper STT + Piper TTS + Wake-Word
# Whisper STT + Wake-Word (TTS via XTTS v2 remote)
# ════════════════════════════════════════════════
FROM python:3.12-slim
+56 -369
View File
@@ -26,7 +26,6 @@ import ssl
import sys
import tempfile
import uuid
import wave
from pathlib import Path
from typing import Optional
@@ -37,8 +36,6 @@ import sounddevice as sd
import websockets
from faster_whisper import WhisperModel
from openwakeword.model import Model as WakeWordModel
from piper import PiperVoice
from piper.config import SynthesisConfig
from modes import Mode, detect_mode_switch, should_speak
@@ -72,38 +69,6 @@ CHANNELS = 1
BLOCK_SIZE = 1280 # 80ms bei 16kHz — gut fuer Wake-Word-Erkennung
RECORD_SECONDS = 8 # Max. Aufnahmedauer nach Wake-Word
# Epic triggers — when a response contains one of these phrases, Thorsten speaks
EPIC_TRIGGERS_DEFAULT = [
    "deploy",
    "erfolgreich",
    "alarm",
    "so soll es sein",
    "kritisch",
    "server down",
    "sicherheitswarnung",
    "ticket geloest",
    "aufgabe abgeschlossen",
]

# Triggers are loaded from the shared config (written by Diagnostic)
TRIGGERS_FILE = "/shared/config/highlight_triggers.json"


def _normalize_triggers(raw) -> list[str]:
    """Return the usable entries of *raw* as stripped, lowercase strings.

    Anything that is not a list yields an empty result; non-string and
    blank entries are dropped. Lowercasing matters because the matcher
    compares against lowercased text.
    """
    if not isinstance(raw, list):
        return []
    return [
        entry.strip().lower()
        for entry in raw
        if isinstance(entry, str) and entry.strip()
    ]


def load_epic_triggers() -> list[str]:
    """Load the highlight triggers from the shared config, or use the defaults.

    Returns:
        The normalized triggers from ``TRIGGERS_FILE`` when that file holds
        a non-empty JSON list of strings, otherwise a fresh copy of
        ``EPIC_TRIGGERS_DEFAULT`` (a copy, so callers cannot mutate the
        defaults by accident).
    """
    try:
        with open(TRIGGERS_FILE, encoding="utf-8") as f:
            triggers = _normalize_triggers(json.load(f))
    except FileNotFoundError:
        # No saved config yet: silently use the defaults.
        return list(EPIC_TRIGGERS_DEFAULT)
    except Exception as e:
        # Best effort: a broken config file must never prevent startup.
        logger.warning("Highlight-Trigger laden fehlgeschlagen: %s — nutze Defaults", e)
        return list(EPIC_TRIGGERS_DEFAULT)

    if triggers:
        logger.info("Highlight-Trigger geladen: %d aus %s", len(triggers), TRIGGERS_FILE)
        return triggers
    return list(EPIC_TRIGGERS_DEFAULT)


EPIC_TRIGGERS = load_epic_triggers()
def load_config() -> dict[str, str]:
"""Laedt Konfiguration.
@@ -290,179 +255,6 @@ def clean_text_for_tts(text: str) -> str:
return t.strip()
class VoiceEngine:
    """Manage Piper TTS with two voices: Ramona and Thorsten."""

    def __init__(self, voices_dir: Path) -> None:
        """Store the voices directory and set the default voice settings.

        Args:
            voices_dir: Directory containing the Piper ``.onnx`` models.
        """
        self.voices_dir = voices_dir
        self.voices: dict[str, PiperVoice] = {}
        self.default_voice = "ramona"
        self.highlight_voice = "thorsten"
        self.speech_speed = {"ramona": 1.0, "thorsten": 1.0}

    def initialize(self) -> None:
        """Load the Piper voices from the voices directory.

        Missing model files are logged and skipped; if no voice could be
        loaded at all, an error is logged and ``self.voices`` stays empty.
        """
        voice_configs = {
            "ramona": "de_DE-ramona-low",
            "thorsten": "de_DE-thorsten-high",
        }

        for name, model_name in voice_configs.items():
            model_path = self.voices_dir / f"{model_name}.onnx"
            config_path = self.voices_dir / f"{model_name}.onnx.json"

            if not model_path.exists():
                logger.error("Stimme nicht gefunden: %s", model_path)
                continue

            self.voices[name] = PiperVoice.load(
                str(model_path),
                config_path=str(config_path) if config_path.exists() else None,
            )
            logger.info("Stimme geladen: %s (%s)", name, model_name)

        if not self.voices:
            logger.error("Keine Stimmen geladen — TTS deaktiviert")

    def select_voice(
        self, text: str, requested_voice: Optional[str] = None
    ) -> str:
        """Pick the voice for *text*, honouring an explicit request.

        The highlight voice is used when the text contains one of the
        ``EPIC_TRIGGERS``; otherwise the default voice is used.

        Args:
            text: The text to be spoken (scanned for highlight triggers).
            requested_voice: Explicitly requested voice; only honoured when
                that voice is actually loaded.

        Returns:
            Name of the selected voice.
        """
        if requested_voice and requested_voice in self.voices:
            return requested_voice

        # Check highlight triggers case-insensitively. Non-string and empty
        # triggers are ignored: an empty string would match every text.
        text_lower = text.lower()
        for trigger in EPIC_TRIGGERS:
            needle = trigger.lower() if isinstance(trigger, str) else ""
            if needle and needle in text_lower:
                logger.info(
                    "Highlight-Trigger erkannt: '%s' -> %s spricht",
                    trigger,
                    self.highlight_voice,
                )
                return self.highlight_voice

        return self.default_voice

    @staticmethod
    def _pcm_dtype(sample_width: int):
        """Map a WAV sample width in bytes to the matching numpy dtype.

        8-bit WAV PCM is unsigned, wider samples are signed. Unknown widths
        fall back to 16-bit.
        """
        return {1: np.uint8, 2: np.int16, 4: np.int32}.get(sample_width, np.int16)

    def synthesize(self, text: str, voice_name: str = "ramona") -> Optional[bytes]:
        """Render *text* to WAV audio with the given voice.

        The text is cleaned, split into sentences, each sentence is
        synthesized separately and the PCM frames are merged into one WAV.
        Everything happens in memory, so no temp files can be left behind
        when synthesis fails.

        Args:
            text: The text to be spoken.
            voice_name: Name of the voice ("ramona" or "thorsten").

        Returns:
            WAV audio data as bytes, or None on error or when nothing
            speakable remains after cleanup.
        """
        voice = self.voices.get(voice_name)
        if voice is None:
            logger.error("Stimme '%s' nicht verfuegbar", voice_name)
            return None

        try:
            import io
            import re

            # Central TTS cleanup (markdown, code, units, URLs)
            clean = clean_text_for_tts(text)
            sentences = [
                s.strip() for s in re.split(r'(?<=[.!?])\s+', clean) if s.strip()
            ]
            if not sentences:
                return None

            # The speed is per voice, so the config is built once up front.
            speed = self.speech_speed.get(voice_name, 1.0)
            syn_config = SynthesisConfig(length_scale=1.0 / max(0.3, speed))

            # Synthesize each sentence on its own and collect the raw PCM.
            pcm_chunks = []
            wav_params = None  # (channels, sample width, frame rate) of first segment
            for sentence in sentences:
                segment = io.BytesIO()
                with wave.open(segment, "wb") as wav_file:
                    voice.synthesize_wav(sentence, wav_file, syn_config=syn_config)

                segment.seek(0)
                with wave.open(segment, "rb") as wav_file:
                    if wav_params is None:
                        wav_params = (
                            wav_file.getnchannels(),
                            wav_file.getsampwidth(),
                            wav_file.getframerate(),
                        )
                    pcm_chunks.append(wav_file.readframes(wav_file.getnframes()))

            # Build the merged WAV with the format the voice actually produced.
            channels, sample_width, frame_rate = wav_params or (1, 2, 22050)
            merged = io.BytesIO()
            with wave.open(merged, "wb") as wav_file:
                wav_file.setnchannels(channels)
                wav_file.setsampwidth(sample_width)
                wav_file.setframerate(frame_rate)
                wav_file.writeframes(b"".join(pcm_chunks))

            audio_data = merged.getvalue()

            logger.info(
                "TTS: %d bytes erzeugt mit %s (%d Saetze) — '%s'",
                len(audio_data),
                voice_name,
                len(sentences),
                text[:60],
            )
            return audio_data

        except Exception:
            logger.exception("TTS-Fehler bei Stimme '%s'", voice_name)
            return None

    def speak(self, text: str, requested_voice: Optional[str] = None) -> None:
        """Speak *text* on the local audio device.

        Selects the voice, synthesizes the audio and blocks until playback
        has finished. Playback errors are logged, not raised.

        Args:
            text: The text to be spoken.
            requested_voice: Optional explicit voice choice.
        """
        voice_name = self.select_voice(text, requested_voice)
        audio_data = self.synthesize(text, voice_name)

        if audio_data is None:
            return

        try:
            import io

            # Decode the WAV in memory and play it through sounddevice
            with wave.open(io.BytesIO(audio_data), "rb") as wf:
                frames = wf.readframes(wf.getnframes())
                sample_width = wf.getsampwidth()
                rate = wf.getframerate()
                channels = wf.getnchannels()

            # Numpy array from the PCM data
            audio_array = np.frombuffer(frames, dtype=self._pcm_dtype(sample_width))
            if channels > 1:
                audio_array = audio_array.reshape(-1, channels)

            sd.play(audio_array, samplerate=rate)
            sd.wait()  # Wait until playback has finished

        except Exception:
            logger.exception("Audio-Wiedergabe fehlgeschlagen")
# ── STT Engine ───────────────────────────────────────────────
@@ -672,9 +464,9 @@ class ARIABridge:
self.current_mode = Mode.NORMAL
self.running = False
# Komponenten
self.voice_engine = VoiceEngine(VOICES_DIR)
# Komponenten (TTS: immer XTTS remote, Piper wurde entfernt)
self.tts_enabled = True
self.xtts_voice = ""
vc: dict = {}
# Gespeicherte Voice-Config laden
try:
@@ -682,16 +474,9 @@ class ARIABridge:
if os.path.exists(vc_path):
with open(vc_path) as f:
vc = json.load(f)
self.voice_engine.default_voice = vc.get("defaultVoice", "ramona")
self.voice_engine.highlight_voice = vc.get("highlightVoice", "thorsten")
self.voice_engine.speech_speed = {
"ramona": vc.get("speedRamona", 1.0),
"thorsten": vc.get("speedThorsten", 1.0),
}
self.tts_enabled = vc.get("ttsEnabled", True)
self.tts_engine_type = vc.get("ttsEngine", "piper")
self.xtts_voice = vc.get("xttsVoice", "")
logger.info("Voice-Config geladen: %s", vc)
logger.info("Voice-Config geladen: tts=%s voice=%s", self.tts_enabled, self.xtts_voice or "default")
except Exception as e:
logger.warning("Voice-Config laden fehlgeschlagen: %s", e)
# Whisper-Modell: Config hat Vorrang, dann env/Default (medium)
@@ -725,9 +510,6 @@ class ARIABridge:
logger.info("ARIA Voice Bridge startet...")
logger.info("=" * 50)
# Voice-Engine IMMER laden — rendert Audio fuer die App (auch ohne Soundkarte)
self.voice_engine.initialize()
# STT IMMER laden — verarbeitet Audio von der App (braucht kein Sounddevice)
self.stt_engine.initialize()
@@ -1050,9 +832,6 @@ class ARIABridge:
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
# Stimme auswaehlen
voice_name = requested_voice or self.voice_engine.select_voice(text)
# Eindeutige Message-ID fuer Audio-Cache-Zuordnung
message_id = str(uuid.uuid4())
@@ -1065,7 +844,6 @@ class ARIABridge:
"payload": {
"text": text,
"sender": "aria",
"voice": voice_name,
"messageId": message_id,
# Debug: aufbereiteter Text fuer TTS (App ignoriert, Diagnostic zeigt optional)
"ttsText": tts_text_preview if tts_text_preview != text else "",
@@ -1073,69 +851,37 @@ class ARIABridge:
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
# TTS-Audio rendern und an die App senden (wenn Modus es erlaubt)
if getattr(self, 'tts_enabled', True) and should_speak(self.current_mode, is_critical):
tts_engine = getattr(self, 'tts_engine_type', 'piper')
if tts_engine == "xtts":
# XTTS: aufbereiteter Text (Code-Bloecke raus, Einheiten ausgeschrieben)
xtts_voice = getattr(self, 'xtts_voice', '')
tts_text = clean_text_for_tts(text)
if not tts_text:
logger.info("[core] TTS-Text leer nach Cleanup — XTTS uebersprungen")
return
try:
xtts_request_id = str(uuid.uuid4())
# Map fuer xtts_response → App-Cache Zuordnung
self._xtts_request_to_message[xtts_request_id] = message_id
if len(self._xtts_request_to_message) > 100:
# Oldest entry raus damit der Dict nicht waechst
oldest = next(iter(self._xtts_request_to_message))
self._xtts_request_to_message.pop(oldest, None)
await self._send_to_rvs({
"type": "xtts_request",
"payload": {
"text": tts_text,
"voice": xtts_voice,
"language": "de",
"requestId": xtts_request_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", tts_text[:60])
except Exception as e:
logger.warning("[core] XTTS-Request fehlgeschlagen: %s — Fallback auf Piper", e)
# Fallback auf Piper
audio_data = self.voice_engine.synthesize(text, voice_name)
if audio_data:
audio_b64 = base64.b64encode(audio_data).decode("ascii")
await self._send_to_rvs({
"type": "audio",
"payload": {"base64": audio_b64, "mimeType": "audio/wav", "voice": voice_name, "messageId": message_id},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
else:
# Piper: Lokal rendern
audio_data = self.voice_engine.synthesize(text, voice_name)
if audio_data:
audio_b64 = base64.b64encode(audio_data).decode("ascii")
await self._send_to_rvs({
"type": "audio",
"payload": {
"base64": audio_b64,
"mimeType": "audio/wav",
"voice": voice_name,
"messageId": message_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[core] TTS-Audio gesendet: %d bytes (%s)", len(audio_data), voice_name)
# Lokal abspielen (nur wenn Soundkarte vorhanden)
if self.audio_available:
self.voice_engine.speak(text, requested_voice)
else:
# TTS ueber XTTS (XTTS-Bridge auf Gaming-PC)
if not (getattr(self, 'tts_enabled', True) and should_speak(self.current_mode, is_critical)):
logger.info("[core] TTS unterdrueckt (Modus: %s)", self.current_mode.config.name)
return
xtts_voice = getattr(self, 'xtts_voice', '')
tts_text = tts_text_preview or text
if not tts_text:
logger.info("[core] TTS-Text leer nach Cleanup — uebersprungen")
return
try:
xtts_request_id = str(uuid.uuid4())
# Map fuer audio_pcm/xtts_response → App-Cache Zuordnung
self._xtts_request_to_message[xtts_request_id] = message_id
if len(self._xtts_request_to_message) > 100:
oldest = next(iter(self._xtts_request_to_message))
self._xtts_request_to_message.pop(oldest, None)
await self._send_to_rvs({
"type": "xtts_request",
"payload": {
"text": tts_text,
"voice": xtts_voice,
"language": "de",
"requestId": xtts_request_id,
"messageId": message_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", tts_text[:60])
except Exception as e:
logger.error("[core] XTTS-Request fehlgeschlagen: %s — kein Audio", e)
def _fetch_active_session(self) -> None:
"""Holt die aktive Session vom Diagnostic-Endpoint."""
@@ -1344,113 +1090,58 @@ class ARIABridge:
return
elif msg_type == "tts_request":
# App fordert TTS-Audio fuer einen Text an (Play-Button).
# Nutze die aktuell konfigurierte Engine (Piper oder XTTS).
# App fordert TTS-Audio fuer einen Text an (Play-Button) → immer XTTS.
text = payload.get("text", "")
requested_voice = payload.get("voice", "")
message_id = payload.get("messageId", "") # fuer Cache-Zuordnung
message_id = payload.get("messageId", "")
if not text:
return
tts_engine = getattr(self, 'tts_engine_type', 'piper')
tts_text = clean_text_for_tts(text) or text
if tts_engine == "xtts":
xtts_voice = getattr(self, 'xtts_voice', '')
try:
await self._send_to_rvs({
"type": "xtts_request",
"payload": {
"text": tts_text,
"voice": xtts_voice,
"language": "de",
"requestId": str(uuid.uuid4()),
"messageId": message_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[rvs] TTS on-demand via XTTS: '%s'", tts_text[:60])
except Exception as e:
logger.warning("[rvs] XTTS-Request fehlgeschlagen, Fallback Piper: %s", e)
tts_engine = "piper"
if tts_engine == "piper":
voice_name = requested_voice or self.voice_engine.select_voice(text)
audio_data = self.voice_engine.synthesize(text, voice_name)
if audio_data:
audio_b64 = base64.b64encode(audio_data).decode("ascii")
try:
await self._send_to_rvs({
"type": "audio",
"payload": {
"base64": audio_b64,
"mimeType": "audio/wav",
"voice": voice_name,
"messageId": message_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[rvs] TTS on-demand via Piper: %d bytes (%s)", len(audio_data), voice_name)
except Exception as e:
logger.warning("[rvs] TTS on-demand senden fehlgeschlagen: %s", e)
xtts_voice = getattr(self, 'xtts_voice', '')
try:
xtts_request_id = str(uuid.uuid4())
if message_id:
self._xtts_request_to_message[xtts_request_id] = message_id
await self._send_to_rvs({
"type": "xtts_request",
"payload": {
"text": tts_text,
"voice": xtts_voice,
"language": "de",
"requestId": xtts_request_id,
"messageId": message_id,
},
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
logger.info("[rvs] TTS on-demand via XTTS: '%s'", tts_text[:60])
except Exception as e:
logger.warning("[rvs] TTS on-demand fehlgeschlagen: %s", e)
return
elif msg_type == "config":
# Konfiguration von App/Diagnostic empfangen + persistent speichern
changed = False
if "defaultVoice" in payload:
new_voice = payload["defaultVoice"]
if new_voice in self.voice_engine.voices:
self.voice_engine.default_voice = new_voice
logger.info("[rvs] Standard-Stimme gewechselt: %s", new_voice)
changed = True
if "highlightVoice" in payload:
new_voice = payload["highlightVoice"]
if new_voice in self.voice_engine.voices:
self.voice_engine.highlight_voice = new_voice
logger.info("[rvs] Highlight-Stimme gewechselt: %s", new_voice)
changed = True
if "ttsEnabled" in payload:
self.tts_enabled = bool(payload["ttsEnabled"])
logger.info("[rvs] TTS %s", "aktiviert" if self.tts_enabled else "deaktiviert")
changed = True
if "ttsEngine" in payload:
self.tts_engine_type = payload["ttsEngine"]
logger.info("[rvs] TTS-Engine: %s", self.tts_engine_type)
changed = True
if "xttsVoice" in payload:
self.xtts_voice = payload["xttsVoice"]
logger.info("[rvs] XTTS-Stimme: %s", self.xtts_voice)
logger.info("[rvs] XTTS-Stimme: %s", self.xtts_voice or "default")
changed = True
if "speedRamona" in payload:
self.voice_engine.speech_speed["ramona"] = max(0.3, min(2.0, float(payload["speedRamona"])))
logger.info("[rvs] Speed Ramona: %.1f", self.voice_engine.speech_speed["ramona"])
changed = True
if "speedThorsten" in payload:
self.voice_engine.speech_speed["thorsten"] = max(0.3, min(2.0, float(payload["speedThorsten"])))
logger.info("[rvs] Speed Thorsten: %.1f", self.voice_engine.speech_speed["thorsten"])
changed = True
whisper_reloaded = False
if "whisperModel" in payload:
new_model = payload["whisperModel"]
if new_model and new_model != self.stt_engine.model_size:
logger.info("[rvs] Whisper-Modell Wechsel: %s -> %s (laedt...)", self.stt_engine.model_size, new_model)
loop = asyncio.get_event_loop()
whisper_reloaded = await loop.run_in_executor(None, self.stt_engine.reload, new_model)
if whisper_reloaded:
if await loop.run_in_executor(None, self.stt_engine.reload, new_model):
changed = True
# Persistent speichern in Shared Volume
if changed:
try:
os.makedirs("/shared/config", exist_ok=True)
config_data = {
"defaultVoice": self.voice_engine.default_voice,
"highlightVoice": self.voice_engine.highlight_voice,
"ttsEnabled": getattr(self, "tts_enabled", True),
"ttsEngine": getattr(self, "tts_engine_type", "piper"),
"xttsVoice": getattr(self, "xtts_voice", ""),
"speedRamona": self.voice_engine.speech_speed.get("ramona", 1.0),
"speedThorsten": self.voice_engine.speech_speed.get("thorsten", 1.0),
"whisperModel": self.stt_engine.model_size,
}
with open("/shared/config/voice_config.json", "w") as f:
@@ -1459,10 +1150,6 @@ class ARIABridge:
except Exception as e:
logger.warning("[rvs] Config speichern fehlgeschlagen: %s", e)
return
text = payload.get("text", "")
if text:
logger.info("[rvs] App-Chat: '%s'", text[:80])
await self.send_to_core(text, source="app")
elif msg_type == "mode":
# Moduswechsel von der App
+1 -2
View File
@@ -5,8 +5,7 @@
# STT — Whisper (lokal, keine API noetig)
faster-whisper
# TTS — Piper (offline, deutsche Stimmen)
piper-tts
# TTS: laeuft remote ueber XTTS v2 auf dem Gaming-PC (keine lokalen Deps noetig)
# WebSocket-Verbindung zu aria-core
websockets