diff --git a/android/src/screens/ChatScreen.tsx b/android/src/screens/ChatScreen.tsx index fc85c7d..0fb589a 100644 --- a/android/src/screens/ChatScreen.tsx +++ b/android/src/screens/ChatScreen.tsx @@ -636,7 +636,7 @@ const ChatScreen: React.FC = () => { {item.text} )} - {/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Regenerierung */} + {/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Bridge-TTS mit aktueller Engine */} {!isUser && item.text.length > 0 && ( { if (item.audioPath) { audioService.playFromPath(item.audioPath); } else { - rvs.send('tts_request' as any, { text: item.text, voice: '' }); + // messageId mitschicken damit die Bridge das generierte Audio + // wieder mit der Nachricht verknuepft (fuer den naechsten Replay aus Cache) + rvs.send('tts_request' as any, { + text: item.text, + voice: '', + messageId: item.messageId || '', + }); } }} > - {item.audioPath ? '\uD83D\uDD0A' : '\uD83D\uDD0A'} + {'\uD83D\uDD0A'} )} {time} diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 7d0c985..9589f09 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -145,6 +145,46 @@ def load_config() -> dict[str, str]: import re as _re_tts +_NUM_WORDS_DE = { + 0: "null", 1: "eins", 2: "zwei", 3: "drei", 4: "vier", 5: "fuenf", + 6: "sechs", 7: "sieben", 8: "acht", 9: "neun", 10: "zehn", + 11: "elf", 12: "zwoelf", 13: "dreizehn", 14: "vierzehn", 15: "fuenfzehn", + 16: "sechzehn", 17: "siebzehn", 18: "achtzehn", 19: "neunzehn", 20: "zwanzig", +} +_TENS_DE = {30: "dreissig", 40: "vierzig", 50: "fuenfzig"} + + +def _num_to_words_de(n: int) -> str: + """Zahlen 0-59 als deutsches Wort — fuer Uhrzeiten und kleine Bereiche.""" + if n in _NUM_WORDS_DE: + return _NUM_WORDS_DE[n] + if 21 <= n <= 29: + return f"{_NUM_WORDS_DE[n - 20]}undzwanzig" + if 30 <= n <= 59: + tens = (n // 10) * 10 + ones = n % 10 + tens_word = _TENS_DE.get(tens, str(tens)) + if ones == 0: + return tens_word + return f"{_NUM_WORDS_DE.get(ones, str(ones))}und{tens_word}" + return str(n) + + +def _time_range_to_words(m): + """'8:00-9:00 Uhr' → 'acht bis neun Uhr', '8-9 Uhr' → 'acht bis neun Uhr'.""" + h1 = int(m.group(1)) + h2 = int(m.group(3)) + return f"{_num_to_words_de(h1)} bis {_num_to_words_de(h2)} Uhr" + + +def _small_range_to_words(m): + """'5-6' → 'fuenf bis sechs' (nur wenn beide Zahlen ≤ 24).""" + a, b = int(m.group(1)), int(m.group(2)) + if a > 24 or b > 24 or a >= b: + return m.group(0) + return f"{_num_to_words_de(a)} bis {_num_to_words_de(b)}" + + _UNIT_WORDS = [ (r'\bTB\b', 'Terabyte'), (r'\bGB\b', 'Gigabyte'), @@ -215,6 +255,22 @@ def clean_text_for_tts(text: str) -> str: t = _re_tts.sub(r'^>\s*', '', t, flags=_re_tts.MULTILINE) t = _re_tts.sub(r'^[\-\*]\s+', '', t, flags=_re_tts.MULTILINE) + # Zeitbereiche: "8:00-9:00 Uhr" / "8-9 Uhr" → "acht bis neun Uhr" + t = _re_tts.sub(r'\b(\d{1,2})(:\d{2})?\s*[-–]\s*(\d{1,2})(:\d{2})?\s*Uhr\b', _time_range_to_words, t) + # Uhrzeiten mit Minuten: "8:30 Uhr" → "acht Uhr dreissig", "8:00 Uhr" → "acht Uhr" + def _single_time(m): + h = int(m.group(1)) + mn = int(m.group(2)) if m.group(2) else 0 + words = _num_to_words_de(h) + " Uhr" + if mn > 0: + words += " " + _num_to_words_de(mn) + return words + t = _re_tts.sub(r'\b(\d{1,2}):(\d{2})\s*Uhr\b', _single_time, t) + # Volle Uhrzeiten ohne ":" — "15 Uhr" → "fuenfzehn Uhr" + t = _re_tts.sub(r'\b(\d{1,2})\s+Uhr\b', lambda m: f"{_num_to_words_de(int(m.group(1)))} Uhr", t) + # Kleine Zahlen-Bereiche ohne "Uhr": "5-6" → "fuenf bis sechs" + t = _re_tts.sub(r'\b(\d{1,2})\s*[-–]\s*(\d{1,2})\b', _small_range_to_words, t) + # Zahlen + Einheit: "22GB" → "22 Gigabyte" (Leerzeichen einfuegen) t = _re_tts.sub(r'(\d+)([A-Za-z]{1,4})\b', r'\1 \2', t) @@ -655,6 +711,8 @@ class ARIABridge: # Zeitstempel des letzten chat:final — waehrend 3s danach werden # trailing Agent-Events unterdrueckt (Core raeumt manchmal nach). self._last_chat_final_at: float = 0.0 + # requestId → messageId Map fuer XTTS-Audio-Cache (App-seitige Zuordnung) + self._xtts_request_to_message: dict[str, str] = {} def initialize(self) -> None: """Initialisiert alle Komponenten. @@ -998,6 +1056,9 @@ class ARIABridge: # Eindeutige Message-ID fuer Audio-Cache-Zuordnung message_id = str(uuid.uuid4()) + # TTS-aufbereitete Variante fuer Debug (Diagnostic zeigt optional) + tts_text_preview = clean_text_for_tts(text) + # Antwort an die App weiterleiten (als Chat-Nachricht) await self._send_to_rvs({ "type": "chat", @@ -1006,6 +1067,8 @@ class ARIABridge: "sender": "aria", "voice": voice_name, "messageId": message_id, + # Debug: aufbereiteter Text fuer TTS (App ignoriert, Diagnostic zeigt optional) + "ttsText": tts_text_preview if tts_text_preview != text else "", }, "timestamp": int(asyncio.get_event_loop().time() * 1000), }) @@ -1022,13 +1085,20 @@ class ARIABridge: logger.info("[core] TTS-Text leer nach Cleanup — XTTS uebersprungen") return try: + xtts_request_id = str(uuid.uuid4()) + # Map fuer xtts_response → App-Cache Zuordnung + self._xtts_request_to_message[xtts_request_id] = message_id + if len(self._xtts_request_to_message) > 100: + # Oldest entry raus damit der Dict nicht waechst + oldest = next(iter(self._xtts_request_to_message)) + self._xtts_request_to_message.pop(oldest, None) await self._send_to_rvs({ "type": "xtts_request", "payload": { "text": tts_text, "voice": xtts_voice, "language": "de", - "requestId": str(uuid.uuid4()), + "requestId": xtts_request_id, }, "timestamp": int(asyncio.get_event_loop().time() * 1000), }) @@ -1230,6 +1300,10 @@ class ARIABridge: # XTTS-Audio vom Gaming-PC empfangen → an App weiterleiten audio_b64 = payload.get("base64", "") error = payload.get("error", "") + req_id_full = payload.get("requestId", "") + # XTTS-Bridge suffixt chunkweise: "uuid_0", "uuid_1" → Basis-UUID extrahieren + req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full + linked_message_id = self._xtts_request_to_message.get(req_id_base, "") if error: logger.warning("[rvs] XTTS Fehler: %s", error) return @@ -1241,16 +1315,44 @@ class ARIABridge: "base64": audio_b64, "mimeType": payload.get("mimeType", "audio/wav"), "voice": payload.get("voice", "xtts"), + "messageId": linked_message_id, }, "timestamp": int(asyncio.get_event_loop().time() * 1000), }) return elif msg_type == "tts_request": - # App fordert TTS-Audio fuer einen Text an (Play-Button) + # App fordert TTS-Audio fuer einen Text an (Play-Button). + # Nutze die aktuell konfigurierte Engine (Piper oder XTTS). text = payload.get("text", "") requested_voice = payload.get("voice", "") - if text: + message_id = payload.get("messageId", "") # fuer Cache-Zuordnung + if not text: + return + + tts_engine = getattr(self, 'tts_engine_type', 'piper') + tts_text = clean_text_for_tts(text) or text + + if tts_engine == "xtts": + xtts_voice = getattr(self, 'xtts_voice', '') + try: + await self._send_to_rvs({ + "type": "xtts_request", + "payload": { + "text": tts_text, + "voice": xtts_voice, + "language": "de", + "requestId": str(uuid.uuid4()), + "messageId": message_id, + }, + "timestamp": int(asyncio.get_event_loop().time() * 1000), + }) + logger.info("[rvs] TTS on-demand via XTTS: '%s'", tts_text[:60]) + except Exception as e: + logger.warning("[rvs] XTTS-Request fehlgeschlagen, Fallback Piper: %s", e) + tts_engine = "piper" + + if tts_engine == "piper": voice_name = requested_voice or self.voice_engine.select_voice(text) audio_data = self.voice_engine.synthesize(text, voice_name) if audio_data: @@ -1262,10 +1364,11 @@ class ARIABridge: "base64": audio_b64, "mimeType": "audio/wav", "voice": voice_name, + "messageId": message_id, }, "timestamp": int(asyncio.get_event_loop().time() * 1000), }) - logger.info("[rvs] TTS on-demand: %d bytes (%s)", len(audio_data), voice_name) + logger.info("[rvs] TTS on-demand via Piper: %d bytes (%s)", len(audio_data), voice_name) except Exception as e: logger.warning("[rvs] TTS on-demand senden fehlgeschlagen: %s", e) return diff --git a/diagnostic/index.html b/diagnostic/index.html index 9affa49..3e0f9ec 100644 --- a/diagnostic/index.html +++ b/diagnostic/index.html @@ -198,7 +198,13 @@

Chat Test

- +
+ + +