# esp32-claude-robbie/python_bridge/tts_engine.py
# (337 lines, 9.4 KiB, Python)

"""
Claude's Eyes - Text-to-Speech Engine
Converts Claude's text responses to spoken audio
"""
import logging
import threading
import queue
from typing import Optional
from abc import ABC, abstractmethod
# Module-level logger used by the engine classes in this file
logger = logging.getLogger(__name__)
class TTSEngine(ABC):
    """Common interface that every text-to-speech backend implements."""

    @abstractmethod
    def speak(self, text: str) -> None:
        """Say ``text`` aloud and return once it has been spoken (blocking)."""

    @abstractmethod
    def speak_async(self, text: str) -> None:
        """Schedule ``text`` to be spoken and return immediately (non-blocking)."""

    @abstractmethod
    def stop(self) -> None:
        """Interrupt the speech that is currently in progress."""

    @abstractmethod
    def is_speaking(self) -> bool:
        """Report whether speech output is in progress right now."""
class Pyttsx3Engine(TTSEngine):
    """TTS using pyttsx3 (offline, system voices).

    Blocking speech goes straight to the pyttsx3 engine. Asynchronous speech
    is queued and played one item at a time by a daemon worker thread.
    """

    def __init__(self, voice: Optional[str] = None, rate: int = 150, volume: float = 0.9):
        """Create and configure the pyttsx3 engine.

        Args:
            voice: Case-insensitive substring of the desired voice name.
                The first installed voice whose name contains it is selected;
                if none matches, the engine default is kept.
            rate: Value passed to the engine's ``rate`` property.
            volume: Value passed to the engine's ``volume`` property.
        """
        # Imported here so the module can be loaded without pyttsx3 installed
        import pyttsx3

        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', rate)
        self.engine.setProperty('volume', volume)

        # Set voice if specified
        if voice:
            wanted = voice.lower()
            for v in self.engine.getProperty('voices'):
                # Guard against drivers that report a voice without a name
                if wanted in (v.name or "").lower():
                    self.engine.setProperty('voice', v.id)
                    break
            else:
                logger.warning(
                    "No pyttsx3 voice matching %r found, keeping default voice", voice
                )

        self._speaking = False
        self._queue = queue.Queue()
        self._thread: Optional[threading.Thread] = None
        self._stop_flag = False
        logger.info("Pyttsx3 TTS engine initialized")

    def speak(self, text: str) -> None:
        """Speak text (blocking).

        Exceptions raised by the pyttsx3 engine propagate to the caller; the
        speaking flag is cleared either way.
        """
        self._speaking = True
        try:
            self.engine.say(text)
            self.engine.runAndWait()
        finally:
            self._speaking = False

    def speak_async(self, text: str) -> None:
        """Queue text for the worker thread and return immediately."""
        # Re-arm before queueing: stop() leaves the flag set, and a worker that
        # is still alive would otherwise exit without ever speaking this text.
        self._stop_flag = False
        self._queue.put(text)
        if self._thread is None or not self._thread.is_alive():
            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
            self._thread.start()

    def _speech_worker(self):
        """Worker loop: speak queued texts until the stop flag is set."""
        while not self._stop_flag:
            try:
                text = self._queue.get(timeout=0.5)
            except queue.Empty:
                # Timeout only exists so the stop flag is re-checked regularly
                continue
            try:
                self.speak(text)
            except Exception:
                # One failed utterance must not kill the worker thread
                logger.exception("pyttsx3 error while speaking queued text")
            finally:
                # Always balance the get(), even when speak() failed
                self._queue.task_done()

    def stop(self) -> None:
        """Stop current speech and discard everything still queued."""
        self._stop_flag = True
        self.engine.stop()
        # Clear queue; mark each dropped item done to keep the task count balanced
        while True:
            try:
                self._queue.get_nowait()
            except queue.Empty:
                break
            self._queue.task_done()

    def is_speaking(self) -> bool:
        """Return True while a ``speak()`` call is in progress."""
        return self._speaking
class GTTSEngine(TTSEngine):
    """TTS using Google Text-to-Speech (online, better quality).

    Each utterance is rendered by gTTS into a temporary MP3 file and played
    through ``pygame.mixer.music``. Asynchronous speech is queued and played
    one item at a time by a daemon worker thread.
    """

    def __init__(self, language: str = "de"):
        """Initialise the pygame mixer and remember the speech language.

        Args:
            language: Language code passed to gTTS as ``lang``.
        """
        # Imported only so a missing gTTS install fails here, not on first speak()
        from gtts import gTTS  # noqa: F401
        import pygame

        pygame.mixer.init()
        self.language = language
        self._speaking = False
        self._queue = queue.Queue()
        self._thread: Optional[threading.Thread] = None
        self._stop_flag = False
        logger.info(f"gTTS engine initialized (language: {language})")

    def speak(self, text: str) -> None:
        """Speak text (blocking).

        Errors during generation or playback are logged, not raised. The
        temporary audio file is removed on every path.
        """
        from gtts import gTTS
        import pygame
        import tempfile
        import os

        self._speaking = True
        temp_path = None
        try:
            # Generate audio file
            tts = gTTS(text=text, lang=self.language)
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
                temp_path = f.name
            # Save after the handle is closed so the file can be reopened by name
            tts.save(temp_path)

            # Play audio
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()

            # Wait for playback to finish; stop() ends this loop by stopping the music
            clock = pygame.time.Clock()
            while pygame.mixer.music.get_busy():
                clock.tick(10)
        except Exception as e:
            logger.error(f"gTTS error: {e}")
        finally:
            # Cleanup runs on the error path too, so temp files do not pile up
            if temp_path is not None:
                try:
                    # unload() only exists in newer pygame; it releases the file
                    unload = getattr(pygame.mixer.music, "unload", None)
                    if unload is not None:
                        unload()
                    os.unlink(temp_path)
                except (OSError, pygame.error) as e:
                    logger.warning(f"gTTS cleanup failed for {temp_path}: {e}")
            self._speaking = False

    def speak_async(self, text: str) -> None:
        """Queue text for the worker thread and return immediately."""
        # Re-arm before queueing: stop() leaves the flag set, and a worker that
        # is still alive would otherwise exit without ever speaking this text.
        self._stop_flag = False
        self._queue.put(text)
        if self._thread is None or not self._thread.is_alive():
            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
            self._thread.start()

    def _speech_worker(self):
        """Worker loop: speak queued texts until the stop flag is set."""
        while not self._stop_flag:
            try:
                text = self._queue.get(timeout=0.5)
            except queue.Empty:
                # Timeout only exists so the stop flag is re-checked regularly
                continue
            try:
                self.speak(text)
            except Exception:
                # One failed utterance must not kill the worker thread
                logger.exception("gTTS error while speaking queued text")
            finally:
                # Always balance the get(), even when speak() failed
                self._queue.task_done()

    def stop(self) -> None:
        """Stop current playback and discard everything still queued."""
        import pygame

        self._stop_flag = True
        pygame.mixer.music.stop()
        # Clear queue; mark each dropped item done to keep the task count balanced
        while True:
            try:
                self._queue.get_nowait()
            except queue.Empty:
                break
            self._queue.task_done()

    def is_speaking(self) -> bool:
        """Return True while a ``speak()`` call is in progress."""
        return self._speaking
class TermuxTTSEngine(TTSEngine):
    """
    TTS via Termux:API for Android.

    Requires:
    - Termux app
    - Termux:API app
    - pkg install termux-api

    Each utterance runs one ``termux-tts-speak`` subprocess. Asynchronous
    speech is queued and played one item at a time by a daemon worker thread.
    """

    def __init__(self, language: str = "de", rate: float = 1.0):
        """Store the speech settings and verify the Termux command exists.

        Args:
            language: Value passed to ``termux-tts-speak -l``.
            rate: Value passed to ``termux-tts-speak -r``.

        Raises:
            RuntimeError: If ``termux-tts-speak`` is not found on PATH.
        """
        self.language = language
        self.rate = rate
        self._speaking = False
        self._queue = queue.Queue()
        self._thread: Optional[threading.Thread] = None
        self._stop_flag = False
        self._process = None

        # Check that termux-tts-speak is available
        import shutil
        if not shutil.which("termux-tts-speak"):
            raise RuntimeError(
                "termux-tts-speak nicht gefunden! "
                "Installiere mit: pkg install termux-api"
            )
        logger.info(f"Termux TTS engine initialized (language: {language})")

    def speak(self, text: str) -> None:
        """Speak text via Termux API (blocking).

        Errors are logged, not raised. The handle of the running subprocess
        is kept in ``self._process`` so ``stop()`` can terminate it.
        """
        import subprocess

        self._speaking = True
        try:
            # termux-tts-speak options:
            # -l <language> - language (e.g. "de" or "de-DE")
            # -r <rate> - speed (0.5 to 2.0, default 1.0)
            # -p <pitch> - pitch (0.5 to 2.0, default 1.0)
            # -s <stream> - audio stream (ALARM, MUSIC, NOTIFICATION, RING, SYSTEM, VOICE_CALL)
            cmd = [
                "termux-tts-speak",
                "-l", self.language,
                "-r", str(self.rate),
                text
            ]
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE
            )
            self._process = proc
            # communicate() drains stderr while waiting; a bare wait() on a
            # piped child can block forever once the pipe buffer fills up.
            _, stderr_data = proc.communicate()
            # Negative codes mean "killed by signal" (e.g. by stop()); not an error
            if proc.returncode > 0:
                details = stderr_data.decode(errors="replace").strip()
                logger.error(
                    f"Termux TTS error: exit code {proc.returncode}: {details}"
                )
        except Exception as e:
            logger.error(f"Termux TTS error: {e}")
        finally:
            # Reset on the error path too so stop() never sees a stale handle
            self._process = None
            self._speaking = False

    def speak_async(self, text: str) -> None:
        """Queue text for the worker thread and return immediately."""
        # Re-arm before queueing: stop() leaves the flag set, and a worker that
        # is still alive would otherwise exit without ever speaking this text.
        self._stop_flag = False
        self._queue.put(text)
        if self._thread is None or not self._thread.is_alive():
            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
            self._thread.start()

    def _speech_worker(self):
        """Worker loop: speak queued texts until the stop flag is set."""
        while not self._stop_flag:
            try:
                text = self._queue.get(timeout=0.5)
            except queue.Empty:
                # Timeout only exists so the stop flag is re-checked regularly
                continue
            try:
                self.speak(text)
            except Exception:
                # One failed utterance must not kill the worker thread
                logger.exception("Termux TTS error while speaking queued text")
            finally:
                # Always balance the get(), even when speak() failed
                self._queue.task_done()

    def stop(self) -> None:
        """Stop current speech and discard everything still queued."""
        self._stop_flag = True

        # Terminate the running process. Read the attribute once: speak() may
        # reset it to None from another thread at any moment.
        proc = self._process
        if proc is not None:
            try:
                proc.terminate()
            except OSError as e:
                # Best effort: the process may have exited in the meantime
                logger.debug(f"Termux TTS terminate failed: {e}")

        # Clear queue; mark each dropped item done to keep the task count balanced
        while True:
            try:
                self._queue.get_nowait()
            except queue.Empty:
                break
            self._queue.task_done()

    def is_speaking(self) -> bool:
        """Return True while a ``speak()`` call is in progress."""
        return self._speaking
def create_tts_engine(engine_type: str = "pyttsx3", **kwargs) -> TTSEngine:
    """
    Build the TTS backend selected by name.

    Args:
        engine_type: "pyttsx3", "gtts", or "termux"
        **kwargs: Engine-specific options. Only the keys the chosen engine
            understands are read ("voice", "rate", "volume" for pyttsx3;
            "language" for gtts; "language", "rate" for termux).

    Raises:
        ValueError: If engine_type is none of the names above.
    """
    option = kwargs.get

    if engine_type == "pyttsx3":
        return Pyttsx3Engine(
            voice=option("voice"),
            rate=option("rate", 150),
            volume=option("volume", 0.9),
        )

    if engine_type == "gtts":
        return GTTSEngine(language=option("language", "de"))

    if engine_type == "termux":
        return TermuxTTSEngine(
            language=option("language", "de"),
            rate=option("rate", 1.0),
        )

    raise ValueError(f"Unknown TTS engine: {engine_type}")
# Manual smoke test, executed only when this file is run as a script
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    # Offline engine first; a failure here is not caught
    print("Testing pyttsx3...")
    offline_engine = create_tts_engine("pyttsx3", rate=150)
    offline_engine.speak("Hallo! Ich bin Claude und erkunde gerade deine Wohnung.")

    # Online engine is optional: report the problem instead of aborting
    print("\nTesting gTTS...")
    try:
        create_tts_engine("gtts", language="de").speak("Das hier klingt noch besser!")
    except Exception as e:
        print(f"gTTS not available: {e}")

    print("\nDone!")