feat: NO_REPLY-Filter + Audio-Ducking + TTS-Cleanup
1) NO_REPLY-Token wird in Bridge und Diagnostic erkannt und still verworfen.
   Toleranz fuer Variationen (Whitespace, Punkt, Quotes). Kein Chat-Eintrag, kein TTS.

2) AudioFocusModule (Kotlin) mit requestDuck / requestExclusive / release.
   AudioService ruft:
   - requestExclusive() bei Aufnahme-Start → andere Apps pausieren
   - requestDuck() bei TTS-Playback-Start → andere Apps leiser
   - release() bei Stop/Queue-Ende
   MainApplication registriert AudioFocusPackage.

3) clean_text_for_tts() in Bridge — zentrale Aufbereitung:
   - <voice>...</voice> Tag wird bevorzugt (falls ARIA es schreibt)
   - Code-Bloecke (``` und `) komplett raus
   - Markdown (Fett/Kursiv/Links/Headings/Listen) entfernt
   - Einheiten ausgeschrieben: 22GB → 22 Gigabyte, 85% → 85 Prozent
   - Abkuerzungen buchstabiert: CPU → C P U, API → A P I
   - URLs durch "ein Link" ersetzt
   Genutzt in VoiceEngine.synthesize und im XTTS-Request — Chat-Text an die App bleibt unveraendert (original Markdown), nur TTS kriegt die aufbereitete Version.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+108
-17
@@ -124,6 +124,97 @@ def load_config() -> dict[str, str]:
|
||||
# ── Voice Engine ─────────────────────────────────────────────
|
||||
|
||||
|
||||
import re as _re_tts
|
||||
|
||||
# (regex, spoken replacement) pairs applied in order by clean_text_for_tts().
# Order matters: compound units ("km/h") must come before their prefixes ("km").
_UNIT_WORDS = [
    # Data sizes
    (r'\bTB\b', 'Terabyte'),
    (r'\bGB\b', 'Gigabyte'),
    (r'\bMB\b', 'Megabyte'),
    (r'\bKB\b', 'Kilobyte'),
    (r'\bkB\b', 'Kilobyte'),
    # Time, distance, speed, mass
    (r'\bms\b', 'Millisekunden'),
    (r'\bkm/h\b', 'Kilometer pro Stunde'),
    (r'\bkm\b', 'Kilometer'),
    (r'\bm/s\b', 'Meter pro Sekunde'),
    (r'\bkg\b', 'Kilogramm'),
    # Temperature. A single rule with a leading space covers both "22°C" and
    # "22 °C"; the former r'\b°C\b' rule produced "22Grad Celsius".
    (r'°C', ' Grad Celsius'),
    # Bandwidth and frequency
    (r'\bMbps\b', 'Megabit pro Sekunde'),
    (r'\bGbps\b', 'Gigabit pro Sekunde'),
    (r'\bMhz\b|\bMHz\b', 'Megahertz'),
    (r'\bGhz\b|\bGHz\b', 'Gigahertz'),
    (r'%', ' Prozent'),
    # Abbreviations that should be spelled out letter by letter
    (r'\bCPU\b', 'C P U'),
    (r'\bGPU\b', 'G P U'),
    (r'\bRAM\b', 'R A M'),
    (r'\bSSD\b', 'S S D'),
    (r'\bHDD\b', 'H D D'),
    (r'\bURL\b', 'U R L'),
    (r'\bAPI\b', 'A P I'),
    (r'\bRVS\b', 'R V S'),
    (r'\bSSH\b', 'S S H'),
    (r'\bVM\b', 'V M'),
    (r'\bUI\b', 'U I'),
    (r'\bTTS\b', 'T T S'),
    (r'\bSTT\b', 'S T T'),
    (r'\bTLS\b', 'T L S'),
]


def clean_text_for_tts(text: str) -> str:
    """Prepare chat text for speech output.

    Processing steps, in order:

    - If a ``<voice>...</voice>`` tag is present, ONLY its content is used.
    - Fenced code blocks and inline code spans are removed entirely.
    - Markdown (headings, quotes, list markers, bold, italic, links) is
      stripped; http(s) URLs are replaced by "ein Link".
    - Units and common abbreviations are spelled out
      (``22GB`` -> ``22 Gigabyte``, ``CPU`` -> ``C P U``).
    - Quotation marks are dropped.
    - Line breaks become sentence or clause breaks without doubling
      punctuation that is already there; whitespace is collapsed.

    Args:
        text: Raw chat text (may contain Markdown). Falsy input yields "".

    Returns:
        The cleaned, single-line text. May be empty, e.g. when the input
        consisted only of a code block.
    """
    if not text:
        return ""

    # <voice>...</voice> present -> speak only that part
    voice_match = _re_tts.search(
        r'<voice>([\s\S]*?)</voice>', text, flags=_re_tts.IGNORECASE
    )
    if voice_match:
        text = voice_match.group(1)

    t = text

    # Code: fenced blocks become a paragraph break (so the surrounding
    # sentences stay separated without leaving a stray "."), inline code
    # spans are dropped.
    t = _re_tts.sub(r'```[\s\S]*?```', '\n\n', t)
    t = _re_tts.sub(r'`[^`]+`', '', t)

    # Line-level Markdown first. List bullets must be gone before the italic
    # rule runs, otherwise "* item with *word*" pairs the bullet with the
    # opening emphasis star and leaves a dangling "*".
    t = _re_tts.sub(r'^#{1,6}[ \t]*', '', t, flags=_re_tts.MULTILINE)
    t = _re_tts.sub(r'^>[ \t]*', '', t, flags=_re_tts.MULTILINE)
    t = _re_tts.sub(r'^[ \t]*[-*][ \t]+', '', t, flags=_re_tts.MULTILINE)

    # Inline Markdown
    t = _re_tts.sub(r'\*\*([^*]+)\*\*', r'\1', t)
    t = _re_tts.sub(r'\*([^*]+)\*', r'\1', t)
    t = _re_tts.sub(r'__([^_]+)__', r'\1', t)
    t = _re_tts.sub(r'\[([^\]]+)\]\((https?://[^)]+)\)', r'\1, ein Link', t)
    t = _re_tts.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', t)
    # Bare URLs: do not swallow trailing sentence punctuation or a closing
    # bracket, so "see https://x.y/z." keeps its final period.
    t = _re_tts.sub(r'https?://\S*[^\s.,;:!?)\]>]', 'ein Link', t)

    # Number + unit: "22GB" -> "22 GB" so the \b-anchored unit rules match
    t = _re_tts.sub(r'(\d+)([A-Za-z]{1,4})\b', r'\1 \2', t)

    # Spell out units / abbreviations
    for pat, repl in _UNIT_WORDS:
        t = _re_tts.sub(pat, repl, t)

    # Quotation marks: ASCII double quote, typographic “ ” „ and backticks
    t = _re_tts.sub('["\u201c\u201d\u201e`]', '', t)

    # Normalise line breaks. Outer whitespace is trimmed first so removed
    # leading/trailing blocks do not turn into a lone ". ".
    t = _re_tts.sub(r'[ \t]+', ' ', t).strip()
    # Punctuation already ends the line -> a plain space is enough
    t = _re_tts.sub(r'(?<=[.!?:;,])\s*\n\s*', ' ', t)
    # Blank line between paragraphs -> sentence break
    t = _re_tts.sub(r'\s*\n\s*\n\s*', '. ', t)
    # Remaining single line break -> short pause
    t = _re_tts.sub(r'\s*\n\s*', ', ', t)
    t = _re_tts.sub(r'\s{2,}', ' ', t)
    # Runs of periods ("..", "...", ". .") collapse to a single sentence end
    t = _re_tts.sub(r'(?:\s*\.){2,}\s*', '. ', t)

    return t.strip()
|
||||
|
||||
|
||||
class VoiceEngine:
|
||||
"""Verwaltet Piper TTS mit zwei Stimmen: Ramona und Thorsten."""
|
||||
|
||||
@@ -201,21 +292,9 @@ class VoiceEngine:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Markdown + Sonderzeichen entfernen fuer natuerliche Sprache
|
||||
# Zentraler TTS-Cleanup (Markdown, Code, Einheiten, URLs)
|
||||
import re
|
||||
clean = text.strip()
|
||||
clean = re.sub(r'\*\*([^*]+)\*\*', r'\1', clean) # **fett**
|
||||
clean = re.sub(r'\*([^*]+)\*', r'\1', clean) # *kursiv*
|
||||
clean = re.sub(r'`[^`]+`', '', clean) # `code`
|
||||
clean = re.sub(r'```[\s\S]*?```', '', clean) # Code-Bloecke
|
||||
clean = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', clean) # [text](url)
|
||||
clean = re.sub(r'#{1,6}\s*', '', clean) # ### Ueberschriften
|
||||
clean = re.sub(r'>\s*', '', clean) # > Zitate
|
||||
clean = re.sub(r'[-*]\s+', '', clean) # Listen
|
||||
clean = re.sub(r'\n{2,}', '. ', clean) # Absaetze
|
||||
clean = re.sub(r'\n', ', ', clean) # Zeilenumbrueche
|
||||
clean = re.sub(r'\s{2,}', ' ', clean) # Mehrfach-Leerzeichen
|
||||
clean = re.sub(r'["""„]', '', clean) # Anfuehrungszeichen
|
||||
clean = clean_text_for_tts(text)
|
||||
sentences = re.split(r'(?<=[.!?])\s+', clean)
|
||||
sentences = [s.strip() for s in sentences if s.strip()]
|
||||
|
||||
@@ -867,6 +946,14 @@ class ARIABridge:
|
||||
- Leitet Antwort an die App weiter (via RVS)
|
||||
- Sprachausgabe ueber TTS (wenn Modus erlaubt)
|
||||
"""
|
||||
# NO_REPLY Token: ARIA signalisiert explizit "nicht antworten"
|
||||
# → komplett verwerfen (keine Chat-Nachricht, kein TTS)
|
||||
# Toleranz fuer Variationen: "NO_REPLY", "no_reply", mit Punkt/Anfuehrungszeichen
|
||||
stripped = text.strip().strip('."\'`*').upper()
|
||||
if stripped == "NO_REPLY" or stripped.startswith("NO_REPLY"):
|
||||
logger.info("[core] NO_REPLY empfangen — Antwort still verworfen")
|
||||
return
|
||||
|
||||
metadata = payload.get("metadata", {})
|
||||
is_critical = metadata.get("critical", False)
|
||||
requested_voice = metadata.get("voice")
|
||||
@@ -905,20 +992,24 @@ class ARIABridge:
|
||||
tts_engine = getattr(self, 'tts_engine_type', 'piper')
|
||||
|
||||
if tts_engine == "xtts":
|
||||
# XTTS: Ganzen Text senden, XTTS-Bridge teilt satzweise auf
|
||||
# XTTS: aufbereiteter Text (Code-Bloecke raus, Einheiten ausgeschrieben)
|
||||
xtts_voice = getattr(self, 'xtts_voice', '')
|
||||
tts_text = clean_text_for_tts(text)
|
||||
if not tts_text:
|
||||
logger.info("[core] TTS-Text leer nach Cleanup — XTTS uebersprungen")
|
||||
return
|
||||
try:
|
||||
await self._send_to_rvs({
|
||||
"type": "xtts_request",
|
||||
"payload": {
|
||||
"text": text,
|
||||
"text": tts_text,
|
||||
"voice": xtts_voice,
|
||||
"language": "de",
|
||||
"requestId": str(uuid.uuid4()),
|
||||
},
|
||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||
})
|
||||
logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", text[:60])
|
||||
logger.info("[core] XTTS-Request gesendet (%s): '%s'", xtts_voice or "default", tts_text[:60])
|
||||
except Exception as e:
|
||||
logger.warning("[core] XTTS-Request fehlgeschlagen: %s — Fallback auf Piper", e)
|
||||
# Fallback auf Piper
|
||||
|
||||
Reference in New Issue
Block a user