217 lines
6.4 KiB
Python
217 lines
6.4 KiB
Python
"""
Claude's Eyes - Speech-to-Text Engine

Converts Stefan's speech to text for Claude
"""
|
|
|
|
import logging
|
|
import threading
|
|
import queue
|
|
from typing import Optional, Callable
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class SpeechResult:
    """One recognized utterance produced by the speech-to-text engine.

    Attributes:
        text: The transcribed words.
        confidence: Score attached to the transcription by the engine.
        is_final: Whether this is a finished result rather than an
            intermediate one.
    """

    text: str
    confidence: float
    is_final: bool
|
|
|
|
|
|
class STTEngine:
    """Speech-to-text engine built on the SpeechRecognition library.

    Two modes are offered:

    * ``listen_once`` blocks until a single phrase has been captured and
      transcribed.
    * ``start_continuous`` / ``stop_continuous`` run a background thread that
      keeps capturing phrases. Every recognized phrase is handed to the
      registered callback and also buffered so it can be polled with
      ``get_result_nonblocking``.
    """

    # Confidence reported per service. The code only uses the plain-text
    # return value of the recognizer calls, so these are fixed values rather
    # than scores obtained from the service.
    _SERVICE_CONFIDENCE = {"google": 0.9, "sphinx": 0.7}

    # Upper bound for results buffered for get_result_nonblocking(); when the
    # buffer is full the oldest entry is discarded so memory stays bounded
    # even if nobody polls.
    _MAX_QUEUED_RESULTS = 100

    # Pause after an unexpected error in the background loop so a persistent
    # failure (e.g. the microphone cannot be opened) does not spin the CPU
    # and flood the log.
    _ERROR_BACKOFF_SECONDS = 0.5

    def __init__(
        self,
        energy_threshold: int = 300,
        pause_threshold: float = 0.8,
        phrase_time_limit: int = 15,
        service: str = "google",
        language: str = "de-DE"
    ):
        """Create the recognizer, open the default microphone and calibrate.

        Args:
            energy_threshold: Initial energy threshold of the recognizer.
            pause_threshold: Pause threshold of the recognizer.
            phrase_time_limit: Maximum length of one captured phrase in
                seconds; applied to every ``listen`` call of this engine.
            service: Recognition backend, ``"google"`` or ``"sphinx"``.
            language: Language tag passed to the Google backend.
        """
        # Imported lazily so this module can be imported without the
        # SpeechRecognition package installed.
        import speech_recognition as sr

        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        # Configure recognizer
        self.recognizer.energy_threshold = energy_threshold
        self.recognizer.pause_threshold = pause_threshold

        # phrase_time_limit is an argument of Recognizer.listen(), not a
        # recognizer setting, so it is kept on the engine and passed on each
        # call. The attribute on the recognizer is still set because earlier
        # versions of this class stored the value there.
        self.phrase_time_limit = phrase_time_limit
        self.recognizer.phrase_time_limit = phrase_time_limit

        self.service = service
        self.language = language

        self._listening = False
        self._callback: Optional[Callable[["SpeechResult"], None]] = None
        self._stop_flag = False
        self._thread: Optional[threading.Thread] = None
        self._results_queue = queue.Queue(maxsize=self._MAX_QUEUED_RESULTS)

        # Calibrate microphone. Calibration may replace the configured energy
        # threshold; the resulting value is logged below.
        logger.info("Calibrating microphone...")
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            logger.info(f"Energy threshold set to {self.recognizer.energy_threshold}")

        logger.info(f"STT engine initialized (service: {service}, language: {language})")

    def listen_once(self, timeout: Optional[float] = None) -> Optional["SpeechResult"]:
        """
        Listen for a single phrase (blocking)

        Args:
            timeout: Maximum time to wait for speech start

        Returns:
            SpeechResult or None if nothing recognized
        """
        import speech_recognition as sr

        try:
            with self.microphone as source:
                logger.debug("Listening...")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    # Previously only the background loop honoured the limit.
                    phrase_time_limit=self.phrase_time_limit,
                )

            return self._recognize(audio)

        except sr.WaitTimeoutError:
            logger.debug("Listen timeout")
            return None
        except Exception as e:
            # Best effort: any capture failure is reported as "nothing heard".
            logger.error(f"Listen error: {e}")
            return None

    def _recognize(self, audio) -> Optional["SpeechResult"]:
        """Transcribe captured audio with the configured service.

        Returns:
            A final SpeechResult, or None when the service is unknown, the
            audio could not be understood, or the service request failed.
        """
        # Reject a misconfigured service before touching the library.
        if self.service not in self._SERVICE_CONFIDENCE:
            logger.error(f"Unknown service: {self.service}")
            return None

        import speech_recognition as sr

        try:
            if self.service == "google":
                text = self.recognizer.recognize_google(audio, language=self.language)
            else:
                # Offline recognition (needs pocketsphinx)
                text = self.recognizer.recognize_sphinx(audio)
        except sr.UnknownValueError:
            logger.debug("Could not understand audio")
            return None
        except sr.RequestError as e:
            logger.error(f"Recognition service error: {e}")
            return None

        return SpeechResult(
            text=text,
            confidence=self._SERVICE_CONFIDENCE[self.service],
            is_final=True,
        )

    def start_continuous(self, callback: Callable[["SpeechResult"], None]) -> None:
        """
        Start continuous listening in background

        Args:
            callback: Function called with each recognized phrase
        """
        if self._listening:
            logger.warning("Already listening")
            return

        previous = self._thread
        if previous is not None and previous.is_alive():
            # A worker from an earlier session has not finished yet. Starting
            # a second one would clear the stop flag the old worker is
            # waiting for and leave two loops sharing one microphone.
            logger.warning("Previous listener thread is still shutting down")
            return

        self._callback = callback
        self._stop_flag = False
        self._listening = True

        self._thread = threading.Thread(target=self._listen_loop, daemon=True)
        self._thread.start()

        logger.info("Continuous listening started")

    def stop_continuous(self) -> None:
        """Stop continuous listening.

        Waits up to two seconds for the background thread. Safe to call from
        inside the callback; in that case the thread is not joined and ends
        on its own once the callback returns.
        """
        self._stop_flag = True
        self._listening = False

        worker = self._thread
        # Joining the current thread would raise RuntimeError.
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=2)
            if worker.is_alive():
                # Still capturing or transcribing a phrase; keep the handle
                # so start_continuous() can tell the worker is not done.
                logger.warning("Listener thread did not stop within 2 seconds")
            else:
                self._thread = None

        logger.info("Continuous listening stopped")

    def _listen_loop(self):
        """Background thread for continuous listening"""
        import time

        import speech_recognition as sr

        while not self._stop_flag:
            try:
                with self.microphone as source:
                    # Short timeout to allow stop checks
                    try:
                        audio = self.recognizer.listen(
                            source,
                            timeout=1,
                            phrase_time_limit=self.phrase_time_limit,
                        )
                    except sr.WaitTimeoutError:
                        continue

                result = self._recognize(audio)
                if result is not None:
                    self._dispatch_result(result)

            except Exception as e:
                # Errors raised while stopping are expected and not reported.
                if not self._stop_flag:
                    logger.error(f"Listen loop error: {e}")
                    time.sleep(self._ERROR_BACKOFF_SECONDS)

    def _dispatch_result(self, result) -> None:
        """Buffer a result for polling and pass it to the callback, if any."""
        while True:
            try:
                self._results_queue.put_nowait(result)
                break
            except queue.Full:
                try:
                    # Discard the oldest buffered result to make room.
                    self._results_queue.get_nowait()
                except queue.Empty:
                    # A consumer drained the queue in the meantime; retry.
                    pass

        if self._callback:
            self._callback(result)

    def is_listening(self) -> bool:
        """Return True while continuous listening is active."""
        return self._listening

    def get_result_nonblocking(self) -> Optional["SpeechResult"]:
        """Return the oldest buffered result, or None if none is waiting."""
        try:
            return self._results_queue.get_nowait()
        except queue.Empty:
            return None
|
|
|
|
|
|
def create_stt_engine(**kwargs) -> STTEngine:
    """Build an STTEngine from loosely specified keyword settings.

    Only ``energy_threshold``, ``pause_threshold``, ``phrase_time_limit``,
    ``service`` and ``language`` are read; a missing key falls back to the
    value listed below and any other keyword is ignored.
    """
    fallbacks = {
        "energy_threshold": 300,
        "pause_threshold": 0.8,
        "phrase_time_limit": 15,
        "service": "google",
        "language": "de-DE",
    }
    settings = {
        option: kwargs.get(option, fallback)
        for option, fallback in fallbacks.items()
    }
    return STTEngine(**settings)
|
|
|
|
|
|
# Test when run directly
|
|
if __name__ == "__main__":
    # Manual smoke test: one blocking capture, then five seconds of
    # continuous listening with each phrase printed as it arrives.
    import time

    logging.basicConfig(level=logging.DEBUG)

    print("Speech-to-Text Test")
    print("=" * 40)

    stt = create_stt_engine(language="de-DE")

    # Single-shot mode
    print("\nSag etwas (du hast 10 Sekunden)...")
    result = stt.listen_once(timeout=10)

    if not result:
        print("\nNichts erkannt.")
    else:
        print(f"\nErkannt: '{result.text}'")
        print(f"Konfidenz: {result.confidence:.0%}")

    # Continuous mode
    print("\n\nKontinuierlicher Modus (5 Sekunden)...")

    def on_speech(result: SpeechResult):
        print(f"  -> {result.text}")

    stt.start_continuous(on_speech)
    time.sleep(5)
    stt.stop_continuous()

    print("\nDone!")
|