# Source: esp32-claude-robbie/claudes_eyes/python_bridge/stt_engine.py (217 lines, 6.4 KiB, Python)

"""
Claude's Eyes - Speech-to-Text Engine
Converts Stefan's speech to text for Claude
"""
import logging
import threading
import queue
from typing import Optional, Callable
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class SpeechResult:
    """Result from speech recognition.

    Attributes:
        text: The recognized text.
        confidence: Confidence score for ``text``. The engine in this module
            fills in a fixed per-service value (0.9 for "google", 0.7 for
            "sphinx") rather than a score reported by the recognizer.
        is_final: Whether this is a final (not interim) result. The engine in
            this module only produces final results.
    """

    text: str
    confidence: float
    is_final: bool
class STTEngine:
    """Speech-to-Text engine using SpeechRecognition library.

    Supports a blocking one-shot mode (``listen_once``) and a background
    continuous mode (``start_continuous`` / ``stop_continuous``). In
    continuous mode every recognized phrase is handed to the callback and is
    also buffered for polling via ``get_result_nonblocking``.

    ``speech_recognition`` is imported lazily inside the methods that need it,
    so importing this module does not require the package to be installed.
    """

    # Upper bound on results buffered for get_result_nonblocking(). When the
    # buffer is full the oldest entry is discarded, so a caller that only uses
    # the callback never accumulates unbounded memory.
    _MAX_QUEUED_RESULTS = 100

    # Pause after an unexpected listener error, so a persistent failure
    # (e.g. the microphone being unavailable) does not spin and flood the log.
    _ERROR_BACKOFF_SECONDS = 0.5

    def __init__(
        self,
        energy_threshold: int = 300,
        pause_threshold: float = 0.8,
        phrase_time_limit: int = 15,
        service: str = "google",
        language: str = "de-DE"
    ):
        """
        Create the recognizer, open the default microphone and calibrate it.

        Args:
            energy_threshold: Initial recognizer energy threshold (adjusted
                again by the ambient-noise calibration below)
            pause_threshold: Seconds of silence that end a phrase
            phrase_time_limit: Maximum length of a single phrase in seconds
            service: Recognition backend, "google" or "sphinx"
            language: Language tag passed to the Google recognizer
        """
        import speech_recognition as sr

        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        # Configure recognizer
        self.recognizer.energy_threshold = energy_threshold
        self.recognizer.pause_threshold = pause_threshold
        # phrase_time_limit is an argument of Recognizer.listen(), not a
        # recognizer setting, so it is kept here and passed on every listen().
        self.phrase_time_limit = phrase_time_limit
        # Also mirrored onto the recognizer, where earlier versions stored it.
        self.recognizer.phrase_time_limit = phrase_time_limit

        self.service = service
        self.language = language

        self._listening = False
        self._callback: Optional[Callable[[SpeechResult], None]] = None
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._results_queue: queue.Queue = queue.Queue(
            maxsize=self._MAX_QUEUED_RESULTS
        )

        # Calibrate microphone
        logger.info("Calibrating microphone...")
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
        logger.info("Energy threshold set to %s", self.recognizer.energy_threshold)
        logger.info(
            "STT engine initialized (service: %s, language: %s)", service, language
        )

    def listen_once(self, timeout: Optional[float] = None) -> Optional[SpeechResult]:
        """
        Listen for a single phrase (blocking)

        Args:
            timeout: Maximum time to wait for speech start

        Returns:
            SpeechResult or None if nothing recognized
        """
        import speech_recognition as sr

        try:
            with self.microphone as source:
                logger.debug("Listening...")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=self.phrase_time_limit,
                )
            # Recognize after leaving the `with` block so the microphone is
            # released while the recognition request is in flight.
            return self._recognize(audio)
        except sr.WaitTimeoutError:
            logger.debug("Listen timeout")
            return None
        except Exception as e:
            # Best effort: any other failure is reported as "nothing heard".
            logger.error("Listen error: %s", e)
            return None

    def _recognize(self, audio) -> Optional[SpeechResult]:
        """
        Recognize speech from audio data

        Args:
            audio: Audio captured by ``Recognizer.listen``

        Returns:
            SpeechResult, or None when the audio was not understood, the
            service failed, or ``self.service`` is not a known backend
        """
        import speech_recognition as sr

        try:
            if self.service == "google":
                text = self.recognizer.recognize_google(audio, language=self.language)
                # Confidence is a fixed estimate, not a value from the service.
                return SpeechResult(text=text, confidence=0.9, is_final=True)
            if self.service == "sphinx":
                # Offline recognition (needs pocketsphinx)
                text = self.recognizer.recognize_sphinx(audio)
                return SpeechResult(text=text, confidence=0.7, is_final=True)
            logger.error("Unknown service: %s", self.service)
            return None
        except sr.UnknownValueError:
            logger.debug("Could not understand audio")
            return None
        except sr.RequestError as e:
            logger.error("Recognition service error: %s", e)
            return None

    def start_continuous(self, callback: Callable[[SpeechResult], None]) -> None:
        """
        Start continuous listening in background

        Does nothing (apart from a warning) when already listening.

        Args:
            callback: Function called with each recognized phrase; it runs on
                the listener thread
        """
        if self._listening:
            logger.warning("Already listening")
            return

        self._callback = callback
        # A fresh event per run: a listener from an earlier run that is still
        # finishing its phrase keeps seeing its own (set) event and exits,
        # instead of being revived by this restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self._listening = True
        self._thread = threading.Thread(
            target=self._listen_loop, args=(stop_event,), daemon=True
        )
        self._thread.start()
        logger.info("Continuous listening started")

    def stop_continuous(self) -> None:
        """
        Stop continuous listening

        Waits up to 2 seconds for the listener thread. Safe to call from
        inside the callback, in which case the thread is not joined.
        """
        self._stop_event.set()
        self._listening = False

        thread = self._thread
        self._thread = None
        # Joining the current thread raises RuntimeError, so skip the join
        # when stop is requested from the callback itself.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=2)
            if thread.is_alive():
                logger.warning(
                    "Listener thread is still finishing its current phrase"
                )
        logger.info("Continuous listening stopped")

    def _enqueue_result(self, result: SpeechResult) -> None:
        """Buffer a result for polling, discarding the oldest one when full."""
        while True:
            try:
                self._results_queue.put_nowait(result)
                return
            except queue.Full:
                try:
                    self._results_queue.get_nowait()
                except queue.Empty:
                    # A consumer drained the queue in between; just retry.
                    pass

    def _listen_loop(self, stop_event: Optional[threading.Event] = None):
        """
        Background thread for continuous listening

        Args:
            stop_event: Event that ends this loop when set; defaults to the
                engine's current stop event
        """
        import speech_recognition as sr

        if stop_event is None:
            stop_event = self._stop_event

        while not stop_event.is_set():
            try:
                with self.microphone as source:
                    # Short timeout to allow stop checks
                    try:
                        audio = self.recognizer.listen(
                            source,
                            timeout=1,
                            phrase_time_limit=self.phrase_time_limit,
                        )
                    except sr.WaitTimeoutError:
                        continue

                result = self._recognize(audio)
                if result is None:
                    continue
                if stop_event.is_set():
                    # Stopped while recognizing: do not deliver late results.
                    break

                self._enqueue_result(result)
                callback = self._callback
                if callback:
                    callback(result)
            except Exception as e:
                # Covers microphone errors as well as exceptions raised by the
                # callback; the loop keeps running after a short pause.
                if not stop_event.is_set():
                    logger.error("Listen loop error: %s", e)
                    stop_event.wait(self._ERROR_BACKOFF_SECONDS)

    def is_listening(self) -> bool:
        """Return True between start_continuous() and stop_continuous()."""
        return self._listening

    def get_result_nonblocking(self) -> Optional[SpeechResult]:
        """
        Get result without blocking (for use with async callback)

        Returns:
            The oldest buffered result from continuous listening, or None
            when no result is waiting
        """
        try:
            return self._results_queue.get_nowait()
        except queue.Empty:
            return None
def create_stt_engine(**kwargs) -> STTEngine:
    """
    Factory function to create STT engine

    Only the options STTEngine understands are forwarded; any other keys are
    ignored, so a wider configuration dict can be passed in unchanged.
    Options that are not supplied fall back to the STTEngine defaults.

    Args:
        **kwargs: Any of energy_threshold, pause_threshold,
            phrase_time_limit, service, language

    Returns:
        A calibrated STTEngine instance
    """
    recognized = (
        "energy_threshold",
        "pause_threshold",
        "phrase_time_limit",
        "service",
        "language",
    )
    options = {key: kwargs[key] for key in recognized if key in kwargs}

    ignored = sorted(set(kwargs) - set(recognized))
    if ignored:
        # Not an error, but useful when hunting a misspelled option name.
        logger.debug("Ignoring unknown STT options: %s", ", ".join(ignored))

    return STTEngine(**options)
# Manual smoke test when run directly: one blocking recognition, then five
# seconds of continuous listening. Needs a working microphone.
if __name__ == "__main__":
    import time

    logging.basicConfig(level=logging.DEBUG)

    print("Speech-to-Text Test")
    print("=" * 40)

    engine = create_stt_engine(language="de-DE")

    # One-shot mode: wait up to 10 seconds for speech to start.
    print("\nSag etwas (du hast 10 Sekunden)...")
    result = engine.listen_once(timeout=10)

    if result:
        print(f"\nErkannt: '{result.text}'")
        print(f"Konfidenz: {result.confidence:.0%}")
    else:
        print("\nNichts erkannt.")

    # Continuous mode: print every phrase recognized in the background.
    print("\n\nKontinuierlicher Modus (5 Sekunden)...")

    def on_speech(heard: SpeechResult):
        print(f" -> {heard.text}")

    engine.start_continuous(on_speech)
    try:
        time.sleep(5)
    finally:
        # Stop the listener even if the wait is interrupted (e.g. Ctrl+C).
        engine.stop_continuous()

    print("\nDone!")