esp32-claude-robbie/python_bridge/stt_engine.py

350 lines
10 KiB
Python

"""
Claude's Eyes - Speech-to-Text Engine
Converts Stefan's speech to text for Claude
"""
import logging
import threading
import queue
from typing import Optional, Callable
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class SpeechResult:
    """One recognized utterance as produced by the STT engines.

    Attributes:
        text: The transcribed phrase.
        confidence: Confidence score attached by the producing engine. The
            engines in this module fill in fixed per-service estimates
            rather than a measured value.
        is_final: Whether this is a final (not partial) transcription.
    """

    text: str
    confidence: float
    is_final: bool
class STTEngine:
    """Speech-to-Text engine using the SpeechRecognition library.

    Audio is captured from an ``sr.Microphone()`` and transcribed with the
    configured service ("google" or "sphinx"). Two modes are offered:

    * ``listen_once`` blocks for a single phrase.
    * ``start_continuous`` runs a background thread that hands every
      recognized phrase to a callback and also buffers it for polling via
      ``get_result_nonblocking``.
    """

    # Cap on buffered phrases so a queue nobody polls cannot grow unbounded.
    _MAX_QUEUED_RESULTS = 100
    # Pause after an unexpected error in the background loop so a broken
    # microphone does not turn the loop into a busy-spin of error logs.
    _ERROR_BACKOFF_SECONDS = 0.5

    def __init__(
        self,
        energy_threshold: int = 300,
        pause_threshold: float = 0.8,
        phrase_time_limit: int = 15,
        service: str = "google",
        language: str = "de-DE",
    ):
        """Create recognizer and microphone, then calibrate to ambient noise.

        Args:
            energy_threshold: Initial recognizer energy threshold (the
                ambient-noise calibration below may overwrite it).
            pause_threshold: Assigned to the recognizer's ``pause_threshold``.
            phrase_time_limit: Maximum phrase length handed to every
                ``listen()`` call.
            service: "google" or "sphinx"; any other value makes recognition
                log an error and return None.
            language: Language tag passed to the Google recognizer.
        """
        import speech_recognition as sr

        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        # Configure recognizer
        self.recognizer.energy_threshold = energy_threshold
        self.recognizer.pause_threshold = pause_threshold
        # phrase_time_limit is an argument of listen(), not a Recognizer
        # setting; it is parked on the recognizer object and read back by
        # both listen_once and the background loop.
        self.recognizer.phrase_time_limit = phrase_time_limit

        self.service = service
        self.language = language

        self._listening = False
        self._callback: Optional[Callable[["SpeechResult"], None]] = None
        # One Event per continuous session (created in start_continuous).
        self._stop_event: Optional[threading.Event] = None
        self._thread: Optional[threading.Thread] = None
        self._results_queue = queue.Queue(maxsize=self._MAX_QUEUED_RESULTS)

        # Calibrate microphone
        logger.info("Calibrating microphone...")
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
        logger.info(f"Energy threshold set to {self.recognizer.energy_threshold}")
        logger.info(f"STT engine initialized (service: {service}, language: {language})")

    def listen_once(self, timeout: Optional[float] = None) -> Optional["SpeechResult"]:
        """Listen for a single phrase (blocking).

        Args:
            timeout: Maximum time to wait for speech to start.

        Returns:
            SpeechResult, or None on timeout, recognition failure or any
            other error (errors are logged, not raised).
        """
        import speech_recognition as sr

        try:
            with self.microphone as source:
                logger.debug("Listening...")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    # Apply the same phrase cap as the continuous mode.
                    phrase_time_limit=self.recognizer.phrase_time_limit,
                )
            # Recognize after leaving the microphone context so the device
            # is not held open during the recognition call.
            return self._recognize(audio)
        except sr.WaitTimeoutError:
            logger.debug("Listen timeout")
            return None
        except Exception as e:
            logger.error(f"Listen error: {e}")
            return None

    def _recognize(self, audio) -> Optional["SpeechResult"]:
        """Transcribe captured audio with the configured service.

        Returns:
            SpeechResult with a fixed confidence (0.9 for Google, 0.7 for
            Sphinx), or None if the service is unknown, the audio could not
            be understood, or the service request failed.
        """
        import speech_recognition as sr

        try:
            if self.service == "google":
                text = self.recognizer.recognize_google(audio, language=self.language)
                return SpeechResult(text=text, confidence=0.9, is_final=True)
            elif self.service == "sphinx":
                # Offline recognition (needs pocketsphinx)
                text = self.recognizer.recognize_sphinx(audio)
                return SpeechResult(text=text, confidence=0.7, is_final=True)
            else:
                logger.error(f"Unknown service: {self.service}")
                return None
        except sr.UnknownValueError:
            logger.debug("Could not understand audio")
            return None
        except sr.RequestError as e:
            logger.error(f"Recognition service error: {e}")
            return None

    def start_continuous(self, callback: Callable[["SpeechResult"], None]) -> None:
        """Start continuous listening in a background daemon thread.

        Args:
            callback: Function called (on the background thread) with each
                recognized phrase.
        """
        if self._listening:
            logger.warning("Already listening")
            return

        self._callback = callback
        # A fresh Event per session: a worker from an earlier session that
        # outlived stop_continuous keeps seeing its own (set) event and
        # cannot be revived by this restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self._listening = True

        self._thread = threading.Thread(
            target=self._listen_loop, args=(stop_event,), daemon=True
        )
        self._thread.start()
        logger.info("Continuous listening started")

    def stop_continuous(self) -> None:
        """Stop continuous listening, waiting up to 2 seconds for the worker."""
        self._listening = False
        if self._stop_event is not None:
            self._stop_event.set()

        worker = self._thread
        self._thread = None
        # Joining the current thread raises RuntimeError, which would happen
        # if the callback itself calls stop_continuous.
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=2)
            if worker.is_alive():
                logger.warning(
                    "Listener thread still finishing its current phrase; "
                    "it will exit on its own"
                )
        logger.info("Continuous listening stopped")

    def _listen_loop(self, stop_event: threading.Event) -> None:
        """Background worker: capture, recognize and publish until stopped.

        Args:
            stop_event: Session-specific event; the loop ends once it is set.
        """
        import speech_recognition as sr

        while not stop_event.is_set():
            try:
                with self.microphone as source:
                    # Short timeout to allow stop checks
                    try:
                        audio = self.recognizer.listen(
                            source,
                            timeout=1,
                            phrase_time_limit=self.recognizer.phrase_time_limit,
                        )
                    except sr.WaitTimeoutError:
                        continue

                result = self._recognize(audio)
                if result:
                    self._publish(result)
            except Exception as e:
                if not stop_event.is_set():
                    logger.error(f"Listen loop error: {e}")
                    # Back off briefly; wait() returns early if stop is requested.
                    stop_event.wait(self._ERROR_BACKOFF_SECONDS)

    def _publish(self, result) -> None:
        """Buffer a recognized phrase for polling and pass it to the callback."""
        try:
            self._results_queue.put_nowait(result)
        except queue.Full:
            # Discard the oldest buffered phrase to make room for the newest.
            try:
                self._results_queue.get_nowait()
            except queue.Empty:
                pass
            try:
                self._results_queue.put_nowait(result)
            except queue.Full:
                # Another producer took the freed slot; drop this phrase.
                pass

        if self._callback:
            self._callback(result)

    def is_listening(self) -> bool:
        """Return True while continuous listening is active."""
        return self._listening

    def get_result_nonblocking(self) -> Optional["SpeechResult"]:
        """Return the oldest buffered phrase from continuous mode, or None.

        Never blocks; intended for polling from async code.
        """
        try:
            return self._results_queue.get_nowait()
        except queue.Empty:
            return None
class TermuxSTTEngine:
    """Speech-to-text through the Termux:API command on Android.

    Requires:
        - Termux app
        - Termux:API app
        - pkg install termux-api

    Note: ``language`` is stored on the instance but is not forwarded to
    ``termux-speech-to-text``; the command is invoked without arguments.
    """

    # Extra seconds granted to the subprocess on top of the listen timeout.
    _API_GRACE_SECONDS = 5
    # Pause between consecutive recordings in continuous mode.
    _LOOP_PAUSE_SECONDS = 0.5

    def __init__(self, language: str = "de-DE", timeout: int = 10):
        """Store settings and verify that the Termux command is installed.

        Args:
            language: Language tag kept on the instance (see class note).
            timeout: Default listen timeout in seconds for ``listen_once``.

        Raises:
            RuntimeError: If ``termux-speech-to-text`` is not on PATH.
        """
        self.language = language
        self.timeout = timeout
        self._listening = False
        # One Event per continuous session (created in start_continuous).
        self._stop_event: Optional[threading.Event] = None
        self._thread: Optional[threading.Thread] = None
        self._callback: Optional[Callable[["SpeechResult"], None]] = None

        # Check that termux-speech-to-text is available
        import shutil
        if not shutil.which("termux-speech-to-text"):
            raise RuntimeError(
                "termux-speech-to-text nicht gefunden! "
                "Installiere mit: pkg install termux-api"
            )
        logger.info(f"Termux STT engine initialized (language: {language})")

    def _effective_timeout(self, timeout: Optional[float]) -> float:
        """Return ``timeout``, or the instance default when it is None.

        An explicit 0 is honored rather than being treated as "unset".
        """
        return self.timeout if timeout is None else timeout

    def listen_once(self, timeout: Optional[float] = None) -> Optional["SpeechResult"]:
        """Listen for a single phrase via the Termux API (blocking).

        Args:
            timeout: Maximum time to wait in seconds (instance default if
                None). The subprocess gets a few extra seconds on top.

        Returns:
            SpeechResult, or None on timeout, non-zero exit status, empty
            output or any other error (errors are logged, not raised).
        """
        import subprocess

        try:
            completed = subprocess.run(
                ["termux-speech-to-text"],
                capture_output=True,
                text=True,
                # Extra time for the API round trip
                timeout=self._effective_timeout(timeout) + self._API_GRACE_SECONDS,
            )

            if completed.returncode != 0:
                logger.error(f"Termux STT error: {completed.stderr}")
                return None

            # The command's stdout is used as the recognized text as-is.
            text = completed.stdout.strip()
            if text:
                return SpeechResult(
                    text=text,
                    confidence=0.8,  # Termux reports no confidence value
                    is_final=True,
                )
            return None

        except subprocess.TimeoutExpired:
            logger.debug("Termux STT timeout")
            return None
        except Exception as e:
            logger.error(f"Termux STT error: {e}")
            return None

    def start_continuous(self, callback: Callable[["SpeechResult"], None]) -> None:
        """Start continuous listening in a background daemon thread.

        Args:
            callback: Function called (on the background thread) with each
                recognized phrase.
        """
        if self._listening:
            logger.warning("Already listening")
            return

        self._callback = callback
        # A fresh Event per session: a worker from an earlier session that
        # outlived stop_continuous keeps seeing its own (set) event and
        # cannot be revived by this restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self._listening = True

        self._thread = threading.Thread(
            target=self._listen_loop, args=(stop_event,), daemon=True
        )
        self._thread.start()
        logger.info("Termux continuous listening started")

    def stop_continuous(self) -> None:
        """Stop continuous listening, waiting up to 2 seconds for the worker."""
        self._listening = False
        if self._stop_event is not None:
            self._stop_event.set()

        worker = self._thread
        self._thread = None
        # Joining the current thread raises RuntimeError, which would happen
        # if the callback itself calls stop_continuous.
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=2)
            if worker.is_alive():
                logger.warning(
                    "Termux listener thread still finishing its current "
                    "recording; it will exit on its own"
                )
        logger.info("Termux continuous listening stopped")

    def _listen_loop(self, stop_event: threading.Event) -> None:
        """Background worker: record and dispatch phrases until stopped.

        Args:
            stop_event: Session-specific event; the loop ends once it is set.
        """
        while not stop_event.is_set():
            try:
                result = self.listen_once(timeout=5)
                if result and self._callback:
                    self._callback(result)
            except Exception as e:
                if not stop_event.is_set():
                    logger.error(f"Termux listen loop error: {e}")

            # Short pause between recordings; wait() returns early on stop.
            stop_event.wait(self._LOOP_PAUSE_SECONDS)

    def is_listening(self) -> bool:
        """Return True while continuous listening is active."""
        return self._listening
def create_stt_engine(engine_type: str = "standard", **kwargs):
    """Build an STT engine of the requested flavour.

    Args:
        engine_type: "termux" selects TermuxSTTEngine; every other value
            (including the default "standard") selects STTEngine.
        **kwargs: Engine-specific options. Recognized keys are
            ``language`` and ``phrase_time_limit`` (both engines; the latter
            becomes the Termux ``timeout``), plus ``energy_threshold``,
            ``pause_threshold`` and ``service`` for the standard engine.
            Missing keys fall back to the defaults below; other keys are
            ignored.

    Returns:
        The constructed engine instance.
    """
    lang = kwargs.get("language", "de-DE")
    phrase_limit = kwargs.get("phrase_time_limit", 15)

    # Everything that is not explicitly "termux" gets the standard
    # SpeechRecognition-based engine.
    if engine_type != "termux":
        return STTEngine(
            energy_threshold=kwargs.get("energy_threshold", 300),
            pause_threshold=kwargs.get("pause_threshold", 0.8),
            phrase_time_limit=phrase_limit,
            service=kwargs.get("service", "google"),
            language=lang,
        )

    return TermuxSTTEngine(language=lang, timeout=phrase_limit)
# Manual smoke test, executed only when the module is run as a script
if __name__ == "__main__":
    import time

    logging.basicConfig(level=logging.DEBUG)

    print("Speech-to-Text Test")
    print("=" * 40)

    stt = create_stt_engine(language="de-DE")

    # Phase 1: capture a single phrase, blocking for up to 10 seconds.
    print("\nSag etwas (du hast 10 Sekunden)...")
    heard = stt.listen_once(timeout=10)
    if not heard:
        print("\nNichts erkannt.")
    else:
        print(f"\nErkannt: '{heard.text}'")
        print(f"Konfidenz: {heard.confidence:.0%}")

    # Phase 2: background listening for 5 seconds, echoing each phrase.
    print("\n\nKontinuierlicher Modus (5 Sekunden)...")

    def _echo_phrase(phrase: SpeechResult):
        print(f"  -> {phrase.text}")

    stt.start_continuous(_echo_phrase)
    time.sleep(5)
    stt.stop_continuous()

    print("\nDone!")