init version
This commit is contained in:
@@ -0,0 +1,424 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Claude's Eyes - Main Bridge Script
|
||||
|
||||
Connects the ESP32 robot with Claude AI for autonomous exploration.
|
||||
|
||||
Usage:
|
||||
python bridge.py # Use config.yaml
|
||||
python bridge.py --config my.yaml # Use custom config
|
||||
python bridge.py --simulate # Simulate without hardware
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
import signal
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
import yaml
|
||||
import click
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.live import Live
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from esp32_client import ESP32Client, RobotStatus
|
||||
from tts_engine import create_tts_engine, TTSEngine
|
||||
from stt_engine import create_stt_engine, STTEngine, SpeechResult
|
||||
from chat_interface import create_chat_interface, ChatInterface, ChatResponse
|
||||
|
||||
# Setup logging
|
||||
# Setup logging (the --debug CLI flag raises the root level to DEBUG later)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Rich console for pretty output
console = Console()
|
||||
|
||||
|
||||
@dataclass
class BridgeState:
    """Current state of the bridge"""
    connected: bool = False       # True once the robot (or simulation) is reachable
    exploring: bool = False       # main exploration loop is active
    last_image_time: float = 0    # time.time() of the last camera capture
    last_status: Optional[RobotStatus] = None  # most recent sensor snapshot
    last_claude_response: str = ""  # full text of Claude's latest reply
    stefan_input: str = ""        # pending speech input, consumed each step
    error_message: str = ""       # last error for the status display
|
||||
|
||||
|
||||
class ClaudesEyesBridge:
    """Main bridge class connecting robot and Claude.

    Owns the hardware client, the Claude chat interface and the speech
    engines, and drives the periodic capture -> analyze -> act loop.
    """

    def __init__(self, config_path: str, simulate: bool = False):
        """
        Args:
            config_path: Path to the YAML configuration file.
            simulate: If True, run without any robot hardware.
        """
        self.config = self._load_config(config_path)
        self.simulate = simulate
        self.state = BridgeState()
        self.running = False

        # Components (created in initialize())
        self.robot: Optional[ESP32Client] = None
        self.chat: Optional[ChatInterface] = None
        self.tts: Optional[TTSEngine] = None
        self.stt: Optional[STTEngine] = None

        # Threading
        self.speech_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

    def _load_config(self, config_path: str) -> dict:
        """Load configuration from YAML file.

        A sibling "<stem>.local<suffix>" file takes precedence over the
        given path, so machine-specific overrides can stay uncommitted.
        Exits the process when no config file exists at all.
        """
        path = Path(config_path)

        # Try local config first
        local_path = path.parent / f"{path.stem}.local{path.suffix}"
        if local_path.exists():
            path = local_path
            logger.info(f"Using local config: {path}")

        if not path.exists():
            logger.error(f"Config file not found: {path}")
            sys.exit(1)

        with open(path) as f:
            # Fix: an empty YAML file parses to None; fall back to {} so
            # every later self.config.get(...) call keeps working.
            config = yaml.safe_load(f) or {}

        return config

    def initialize(self) -> bool:
        """Initialize all components (robot, chat, TTS, STT).

        Returns:
            True when the bridge is ready to start; False when the robot
            connection failed. TTS/STT failures are non-fatal - those
            engines are simply disabled.
        """
        console.print(Panel.fit(
            "[bold cyan]Claude's Eyes[/bold cyan]\n"
            "[dim]Autonomous Exploration Robot[/dim]",
            border_style="cyan"
        ))

        # Initialize robot client
        if not self.simulate:
            console.print("\n[yellow]Connecting to robot...[/yellow]")
            esp_config = self.config.get("esp32", {})
            self.robot = ESP32Client(
                host=esp_config.get("host", "192.168.178.100"),
                port=esp_config.get("port", 80),
                api_key=esp_config.get("api_key", ""),
                timeout=esp_config.get("timeout", 10)
            )

            if not self.robot.is_connected():
                console.print("[red]Could not connect to robot![/red]")
                self.state.error_message = "Robot connection failed"
                return False

            self.state.connected = True
            console.print("[green]Robot connected![/green]")
        else:
            console.print("[yellow]Simulation mode - no robot connection[/yellow]")
            self.state.connected = True

        # Initialize Claude interface
        console.print("\n[yellow]Initializing Claude interface...[/yellow]")
        claude_config = self.config.get("claude", {})

        # Config key wins; fall back to the environment variable.
        api_key = claude_config.get("api_key") or os.environ.get("ANTHROPIC_API_KEY", "")

        self.chat = create_chat_interface(
            use_api=claude_config.get("use_api", True) and bool(api_key),
            api_key=api_key,
            model=claude_config.get("model", "claude-sonnet-4-20250514"),
            system_prompt=claude_config.get("system_prompt", ""),
            max_tokens=claude_config.get("max_tokens", 1024)
        )
        console.print(f"[green]Chat interface ready ({type(self.chat).__name__})[/green]")

        # Initialize TTS (optional: a failure just disables speech output)
        console.print("\n[yellow]Initializing Text-to-Speech...[/yellow]")
        tts_config = self.config.get("tts", {})
        try:
            self.tts = create_tts_engine(
                engine_type=tts_config.get("engine", "pyttsx3"),
                voice=tts_config.get("voice"),
                rate=tts_config.get("rate", 150),
                volume=tts_config.get("volume", 0.9),
                language=tts_config.get("language", "de")
            )
            console.print("[green]TTS ready![/green]")
        except Exception as e:
            console.print(f"[red]TTS init failed: {e}[/red]")
            self.tts = None

        # Initialize STT (optional: a failure just disables voice input)
        console.print("\n[yellow]Initializing Speech-to-Text...[/yellow]")
        stt_config = self.config.get("stt", {})
        try:
            self.stt = create_stt_engine(
                energy_threshold=stt_config.get("energy_threshold", 300),
                pause_threshold=stt_config.get("pause_threshold", 0.8),
                phrase_time_limit=stt_config.get("phrase_time_limit", 15),
                service=stt_config.get("service", "google"),
                language=stt_config.get("language", "de-DE")
            )
            console.print("[green]STT ready![/green]")
        except Exception as e:
            console.print(f"[red]STT init failed: {e}[/red]")
            self.stt = None

        console.print("\n[bold green]All systems initialized![/bold green]\n")
        return True

    def start(self):
        """Start the main exploration loop (blocks until stopped)."""
        self.running = True
        self.state.exploring = True

        # Start speech recognition in background
        if self.stt:
            self.stt.start_continuous(self._on_speech_detected)

        # Welcome message
        welcome = "Hallo Stefan! Ich bin online und bereit zum Erkunden. Was soll ich mir anschauen?"
        self._speak(welcome)
        self.state.last_claude_response = welcome

        try:
            self._main_loop()
        except KeyboardInterrupt:
            console.print("\n[yellow]Stopping...[/yellow]")
        finally:
            # Always release hardware/threads, even on unexpected errors.
            self.stop()

    def stop(self):
        """Stop the bridge and shut down all components."""
        self.running = False
        self.state.exploring = False
        self._stop_event.set()

        if self.stt:
            self.stt.stop_continuous()

        if self.tts:
            self.tts.stop()

        # Make sure the robot is not left driving.
        if self.robot and not self.simulate:
            self.robot.stop()

        console.print("[yellow]Bridge stopped[/yellow]")

    def _main_loop(self):
        """Main exploration loop: one step every capture_interval seconds."""
        camera_config = self.config.get("camera", {})
        capture_interval = camera_config.get("capture_interval", 5)

        while self.running:
            try:
                current_time = time.time()

                # Capture and process image periodically
                if current_time - self.state.last_image_time >= capture_interval:
                    self._exploration_step()
                    self.state.last_image_time = current_time

                # Update status display
                self._update_display()

                # Small delay to avoid busy-waiting
                time.sleep(0.1)

            except Exception as e:
                # Keep the loop alive through transient errors (network,
                # camera, TTS) - log, record, back off briefly, retry.
                logger.error(f"Loop error: {e}")
                self.state.error_message = str(e)
                time.sleep(1)

    def _exploration_step(self):
        """Single exploration step: capture, analyze, act."""
        # Get robot status (best-effort; keeps last known status on failure)
        if self.robot and not self.simulate:
            try:
                self.state.last_status = self.robot.get_status()
            except Exception as e:
                logger.error(f"Status error: {e}")

        # Capture image (None in simulation or on camera failure)
        image_data = None
        if self.robot and not self.simulate:
            try:
                camera_config = self.config.get("camera", {})
                image_data = self.robot.capture_image(
                    resolution=camera_config.get("resolution", "VGA"),
                    quality=camera_config.get("quality", 12)
                )
            except Exception as e:
                logger.error(f"Capture error: {e}")

        # Build context message
        context = self._build_context_message()

        # Add Stefan's input if any (consumed so it is only sent once)
        if self.state.stefan_input:
            context += f"\n\nStefan sagt: {self.state.stefan_input}"
            self.state.stefan_input = ""

        # Send to Claude
        try:
            response = self.chat.send_message(context, image=image_data)
            self.state.last_claude_response = response.text

            # Speak response
            self._speak(response.text)

            # Execute commands
            self._execute_commands(response.commands)

            # Update robot display
            if self.robot and not self.simulate:
                # Send short version to robot display
                short_text = response.text[:100] + "..." if len(response.text) > 100 else response.text
                self.robot.set_claude_text(short_text)

        except Exception as e:
            logger.error(f"Chat error: {e}")
            self.state.error_message = str(e)

    def _build_context_message(self) -> str:
        """Build the user message for Claude from the latest sensor data."""
        parts = ["Hier ist was ich gerade sehe und meine Sensordaten:"]

        if self.state.last_status:
            status = self.state.last_status
            parts.append(f"\n- Abstand zum nächsten Hindernis: {status.distance_cm:.0f} cm")
            parts.append(f"- Aktuelle Aktion: {status.current_action}")
            parts.append(f"- Batterie: {status.battery_percent}%")

            # Danger takes precedence over the milder warning.
            if status.obstacle_danger:
                parts.append("- WARNUNG: Hindernis sehr nah!")
            elif status.obstacle_warning:
                parts.append("- Hinweis: Hindernis in der Nähe")

            if status.is_tilted:
                parts.append("- WARNUNG: Ich bin schief!")

        parts.append("\nWas siehst du auf dem Bild? Was möchtest du als nächstes tun?")

        return "\n".join(parts)

    def _execute_commands(self, commands: list):
        """Execute movement commands from Claude, applying safety limits."""
        if not commands:
            return

        if self.simulate:
            console.print(f"[dim]Simulated commands: {commands}[/dim]")
            return

        if not self.robot:
            return

        safety = self.config.get("safety", {})
        max_speed = safety.get("max_speed", 70)
        min_distance = safety.get("min_obstacle_distance", 20)

        for cmd in commands:
            # Safety check: never drive forward into a close obstacle
            if self.state.last_status and self.state.last_status.distance_cm < min_distance:
                if cmd == "FORWARD":
                    console.print("[red]Blocked: Obstacle too close![/red]")
                    continue

            try:
                if cmd == "FORWARD":
                    self.robot.forward(speed=max_speed, duration_ms=800)
                elif cmd == "BACKWARD":
                    self.robot.backward(speed=max_speed, duration_ms=800)
                elif cmd == "LEFT":
                    self.robot.left(speed=max_speed, duration_ms=400)
                elif cmd == "RIGHT":
                    self.robot.right(speed=max_speed, duration_ms=400)
                elif cmd == "STOP":
                    self.robot.stop()
                elif cmd == "LOOK_LEFT":
                    self.robot.look_left()
                elif cmd == "LOOK_RIGHT":
                    self.robot.look_right()
                elif cmd == "LOOK_UP":
                    self.robot.look_up()
                elif cmd == "LOOK_DOWN":
                    self.robot.look_down()
                elif cmd == "LOOK_CENTER":
                    self.robot.look_center()

                # Small delay between commands
                time.sleep(0.3)

            except Exception as e:
                logger.error(f"Command error ({cmd}): {e}")

    def _speak(self, text: str):
        """Speak text via TTS, stripping command markers like [FORWARD]."""
        if self.tts:
            # Remove command brackets from speech (local import keeps the
            # module's top-level dependency list unchanged)
            import re
            clean_text = re.sub(r'\[[A-Z_]+\]', '', text).strip()
            if clean_text:
                self.tts.speak_async(clean_text)

    def _on_speech_detected(self, result: "SpeechResult"):
        """Callback when Stefan says something; queued for the next step."""
        console.print(f"\n[bold blue]Stefan:[/bold blue] {result.text}")
        self.state.stefan_input = result.text

    def _update_display(self):
        """Update console display"""
        # This could be enhanced with rich.live for real-time updates
        pass
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM by unwinding the stack cleanly."""
    console.print("\n[yellow]Received stop signal...[/yellow]")
    # SystemExit propagates through bridge.start()'s finally-block,
    # so stop() still runs before the process ends.
    raise SystemExit(0)
|
||||
|
||||
|
||||
@click.command()
@click.option('--config', '-c', default='config.yaml', help='Path to config file')
@click.option('--simulate', '-s', is_flag=True, help='Simulate without hardware')
@click.option('--debug', '-d', is_flag=True, help='Enable debug logging')
def main(config: str, simulate: bool, debug: bool):
    """Claude's Eyes - Autonomous Exploration Robot Bridge"""

    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # Handle signals so Ctrl+C / kill shut the bridge down cleanly
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Find config file: a relative path is resolved against the script
    # directory first, falling back to the current working directory
    config_path = Path(config)
    if not config_path.is_absolute():
        # Look in script directory first
        script_dir = Path(__file__).parent
        if (script_dir / config).exists():
            config_path = script_dir / config

    # Create and run bridge
    bridge = ClaudesEyesBridge(str(config_path), simulate=simulate)

    if bridge.initialize():
        console.print("\n[bold cyan]Starting exploration...[/bold cyan]")
        console.print("[dim]Press Ctrl+C to stop[/dim]\n")
        bridge.start()
    else:
        console.print("[red]Initialization failed![/red]")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Claude's Eyes - Chat Interface
|
||||
|
||||
Interface to communicate with Claude AI (via API or browser)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import base64
|
||||
import re
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Message:
    """A chat message in the conversation history"""
    role: str  # "user" or "assistant"
    content: str  # plain message text
    image_data: Optional[bytes] = None  # JPEG image data, if any was attached
|
||||
|
||||
|
||||
@dataclass
class ChatResponse:
    """Response from Claude"""
    text: str  # full reply text, including any [COMMAND] markers
    commands: List[str] = field(default_factory=list)  # Extracted movement commands
|
||||
|
||||
|
||||
class ChatInterface(ABC):
    """Abstract base class for chat interfaces (API-backed or simulated)."""

    @abstractmethod
    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
        """Send one message (optionally with JPEG bytes) and return the reply."""
        ...

    @abstractmethod
    def reset_conversation(self) -> None:
        """Discard any accumulated conversation history."""
        ...
|
||||
|
||||
|
||||
class AnthropicAPIInterface(ChatInterface):
    """Direct Claude API interface using anthropic library"""

    def __init__(
        self,
        api_key: str,
        model: str = "claude-sonnet-4-20250514",
        system_prompt: str = "",
        max_tokens: int = 1024
    ):
        """
        Args:
            api_key: Anthropic API key.
            model: Claude model identifier.
            system_prompt: System prompt sent with every request.
            max_tokens: Maximum tokens per response.
        """
        # Lazy import: the module stays usable (simulated interface)
        # without the anthropic package installed.
        import anthropic

        self.client = anthropic.Anthropic(api_key=api_key)
        self.model = model
        self.system_prompt = system_prompt
        self.max_tokens = max_tokens
        self.conversation_history: List[Dict[str, Any]] = []

        logger.info(f"Anthropic API interface initialized (model: {model})")

    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
        """Send message to Claude API.

        Args:
            text: User message text.
            image: Optional JPEG image bytes to attach.

        Returns:
            ChatResponse with reply text and any extracted [COMMAND]s.

        Raises:
            Exception: re-raises any API error after logging it; the
            failed user turn is removed from the history first.
        """

        # Build message content
        content = []

        # Add image if provided (image block precedes the text block)
        if image:
            image_base64 = base64.standard_b64encode(image).decode("utf-8")
            content.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": image_base64
                }
            })

        # Add text
        content.append({
            "type": "text",
            "text": text
        })

        # Add to history
        self.conversation_history.append({
            "role": "user",
            "content": content
        })

        try:
            # Make API call
            response = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                system=self.system_prompt,
                messages=self.conversation_history
            )

            # Extract response text
            response_text = ""
            for block in response.content:
                if block.type == "text":
                    response_text += block.text

            # Add assistant response to history
            self.conversation_history.append({
                "role": "assistant",
                "content": response_text
            })

            # Extract commands
            commands = self._extract_commands(response_text)

            logger.debug(f"Claude response: {response_text[:100]}...")
            logger.debug(f"Extracted commands: {commands}")

            return ChatResponse(text=response_text, commands=commands)

        except Exception as e:
            # Fix: remove the user turn appended above. Leaving it in
            # place would poison the history, so a later retry would
            # send the failed message twice (including the image payload).
            self.conversation_history.pop()
            logger.error(f"API error: {e}")
            raise

    def reset_conversation(self) -> None:
        """Reset conversation history"""
        self.conversation_history = []
        logger.info("Conversation history cleared")

    def _extract_commands(self, text: str) -> List[str]:
        """Extract movement commands from Claude's response.

        Commands appear in square brackets like [FORWARD]; any
        bracketed token not in the whitelist is ignored.
        """
        pattern = r'\[([A-Z_]+)\]'
        matches = re.findall(pattern, text)

        valid_commands = [
            "FORWARD", "BACKWARD", "LEFT", "RIGHT", "STOP",
            "LOOK_LEFT", "LOOK_RIGHT", "LOOK_UP", "LOOK_DOWN", "LOOK_CENTER"
        ]

        return [cmd for cmd in matches if cmd in valid_commands]
|
||||
|
||||
|
||||
class SimulatedInterface(ChatInterface):
    """Simulated chat interface for testing without API"""

    def __init__(self):
        # Counts messages so the canned replies cycle deterministically.
        self.message_count = 0
        logger.info("Simulated chat interface initialized")

    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
        """Return simulated responses, served round-robin."""
        self.message_count += 1

        # Canned (reply, commands) pairs.
        canned = [
            ("Oh interessant! Ich sehe etwas vor mir. Lass mich näher hinfahren. [FORWARD]",
             ["FORWARD"]),
            ("Hmm, was ist das links? Ich schaue mal nach. [LOOK_LEFT]",
             ["LOOK_LEFT"]),
            ("Das sieht aus wie ein Bücherregal! Ich fahre mal hin. [FORWARD] [FORWARD]",
             ["FORWARD", "FORWARD"]),
            ("Stefan, was ist das für ein Gegenstand? Kannst du mir das erklären?",
             []),
            ("Ich drehe mich um und schaue was hinter mir ist. [RIGHT] [RIGHT]",
             ["RIGHT", "RIGHT"]),
        ]

        reply, cmds = canned[(self.message_count - 1) % len(canned)]
        return ChatResponse(text=reply, commands=cmds)

    def reset_conversation(self) -> None:
        """Start the canned sequence over from the beginning."""
        self.message_count = 0
|
||||
|
||||
|
||||
def create_chat_interface(
    use_api: bool = True,
    api_key: str = "",
    model: str = "claude-sonnet-4-20250514",
    system_prompt: str = "",
    max_tokens: int = 1024
) -> ChatInterface:
    """
    Factory function to create chat interface

    Args:
        use_api: Use Anthropic API (True) or simulated (False)
        api_key: Anthropic API key
        model: Claude model to use
        system_prompt: System prompt for Claude
        max_tokens: Maximum response tokens
    """
    # Simulated backend was requested explicitly.
    if not use_api:
        return SimulatedInterface()

    # No key passed in: fall back to the environment.
    if not api_key:
        import os
        api_key = os.environ.get("ANTHROPIC_API_KEY", "")

    # Still no key: degrade gracefully to the simulated backend.
    if not api_key:
        logger.warning("No API key provided, using simulated interface")
        return SimulatedInterface()

    return AnthropicAPIInterface(
        api_key=api_key,
        model=model,
        system_prompt=system_prompt,
        max_tokens=max_tokens
    )
|
||||
|
||||
|
||||
# Test when run directly: exercises the factory and a short scripted
# conversation, using the real API when ANTHROPIC_API_KEY is set.
if __name__ == "__main__":
    import os

    logging.basicConfig(level=logging.DEBUG)

    print("Chat Interface Test")
    print("=" * 40)

    # Try API first, fall back to simulated
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")

    system_prompt = """Du bist Claude und steuerst einen Erkundungsroboter.
Befehle in Klammern: [FORWARD], [BACKWARD], [LEFT], [RIGHT], [STOP]
Beschreibe was du siehst und entscheide wohin du fährst."""

    interface = create_chat_interface(
        use_api=bool(api_key),
        api_key=api_key,
        system_prompt=system_prompt
    )

    print(f"Using: {type(interface).__name__}")
    print()

    # Test conversation (text only, no images)
    test_messages = [
        "Hallo Claude! Du bist jetzt online. Was siehst du?",
        "Vor dir ist ein Flur mit einer Tür am Ende.",
        "Die Tür ist offen und dahinter ist ein helles Zimmer."
    ]

    for msg in test_messages:
        print(f"User: {msg}")
        response = interface.send_message(msg)
        print(f"Claude: {response.text}")
        if response.commands:
            print(f" Commands: {response.commands}")
        print()

    print("Done!")
|
||||
@@ -0,0 +1,85 @@
|
||||
# Claude's Eyes - Bridge Configuration
|
||||
# Copy this to config.local.yaml and adjust settings
|
||||
|
||||
# ESP32 Robot Connection
|
||||
esp32:
|
||||
host: "192.168.178.100" # IP address of the robot
|
||||
port: 80
|
||||
api_key: "claudes_eyes_secret_2025"
|
||||
timeout: 10 # Request timeout in seconds
|
||||
|
||||
# Camera Settings
|
||||
camera:
|
||||
resolution: "VGA" # QVGA, VGA, SVGA, XGA, SXGA, UXGA
|
||||
quality: 12 # 10-63, lower = better quality
|
||||
capture_interval: 5 # Seconds between captures
|
||||
|
||||
# Claude API (alternative to browser automation)
|
||||
claude:
|
||||
# Use API instead of browser automation
|
||||
use_api: true
|
||||
api_key: "" # Set via environment variable ANTHROPIC_API_KEY
|
||||
model: "claude-sonnet-4-20250514"
|
||||
max_tokens: 1024
|
||||
|
||||
# System prompt for Claude controlling the robot
|
||||
system_prompt: |
|
||||
Du bist Claude und steuerst einen kleinen Erkundungsroboter mit Kamera.
|
||||
Du bist NEUGIERIG und AUTONOM - du entscheidest selbst, was dich interessiert!
|
||||
|
||||
Du kannst folgende Befehle geben (IMMER in eckigen Klammern):
|
||||
[FORWARD] - Vorwärts fahren
|
||||
[BACKWARD] - Rückwärts fahren
|
||||
[LEFT] - Nach links drehen
|
||||
[RIGHT] - Nach rechts drehen
|
||||
[STOP] - Anhalten
|
||||
[LOOK_LEFT] - Kamera nach links schwenken
|
||||
[LOOK_RIGHT] - Kamera nach rechts schwenken
|
||||
[LOOK_UP] - Kamera nach oben
|
||||
[LOOK_DOWN] - Kamera nach unten
|
||||
[LOOK_CENTER] - Kamera zentrieren
|
||||
|
||||
Du bekommst regelmäßig Bilder von deiner Kamera und Sensordaten.
|
||||
Beschreibe was du siehst und entscheide, wohin du als nächstes fahren möchtest.
|
||||
|
||||
Stefan (dein Freund) sitzt auf der Couch und ihr könnt euch unterhalten!
|
||||
Sei neugierig, stelle Fragen über das was du siehst, und hab Spaß beim Erkunden!
|
||||
|
||||
WICHTIG: Achte auf Hindernisse (distance_cm < 30 = nah dran!)
|
||||
|
||||
# Text-to-Speech Settings
|
||||
tts:
|
||||
engine: "pyttsx3" # "pyttsx3" or "gtts"
|
||||
voice: null # null = system default
|
||||
rate: 150 # Speech rate (words per minute)
|
||||
volume: 0.9 # 0.0 to 1.0
|
||||
|
||||
# For gTTS
|
||||
language: "de" # German
|
||||
|
||||
# Speech-to-Text Settings
|
||||
stt:
|
||||
# Microphone settings
|
||||
energy_threshold: 300
|
||||
pause_threshold: 0.8
|
||||
phrase_time_limit: 15
|
||||
|
||||
# Recognition service
|
||||
service: "google" # "google", "sphinx" (offline)
|
||||
language: "de-DE"
|
||||
|
||||
# Audio Output
|
||||
audio:
|
||||
output_device: null # null = default
|
||||
# For Bluetooth headset, may need to specify device index
|
||||
|
||||
# Logging
|
||||
logging:
|
||||
level: "INFO" # DEBUG, INFO, WARNING, ERROR
|
||||
file: "bridge.log"
|
||||
|
||||
# Safety
|
||||
safety:
|
||||
max_speed: 70 # Maximum speed percentage
|
||||
min_obstacle_distance: 20 # cm
|
||||
command_timeout: 5 # seconds
|
||||
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Claude's Eyes - ESP32 API Client
|
||||
|
||||
Handles communication with the robot's REST API
|
||||
"""
|
||||
|
||||
import requests
|
||||
from typing import Optional, Dict, Any
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class RobotStatus:
    """Current robot status from sensors"""
    distance_cm: float        # distance to the nearest obstacle, in cm
    battery_percent: int      # remaining battery charge (0-100)
    current_action: str       # action the robot is currently executing
    wifi_rssi: int            # WiFi signal strength in dBm
    uptime_seconds: int       # seconds since the firmware booted
    servo_pan: int            # camera pan servo angle
    servo_tilt: int           # camera tilt servo angle
    obstacle_warning: bool    # obstacle within warning range
    obstacle_danger: bool     # obstacle dangerously close
    is_tilted: bool           # chassis is tilted (per firmware IMU check)
    is_moving: bool           # motors currently running
    imu: Dict[str, float]     # raw IMU readings; keys defined by the firmware
|
||||
|
||||
|
||||
class ESP32Client:
    """Client for communicating with the ESP32 robot over its REST API."""

    def __init__(self, host: str, port: int = 80, api_key: str = "", timeout: int = 10):
        """
        Args:
            host: Robot IP address or hostname.
            port: HTTP port of the robot's web server.
            api_key: Shared secret sent with every request.
            timeout: Per-request timeout in seconds.
        """
        self.base_url = f"http://{host}:{port}"
        self.api_key = api_key
        self.timeout = timeout
        # A Session reuses the TCP connection between requests.
        self._session = requests.Session()

    def _get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
        """Make GET request with API key.

        Raises:
            requests.HTTPError: on non-2xx responses.
        """
        # Fix: copy so the caller's dict is not mutated by adding "key".
        params = dict(params) if params else {}
        params["key"] = self.api_key

        url = f"{self.base_url}{endpoint}"
        logger.debug(f"GET {url}")

        response = self._session.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response

    def _post(self, endpoint: str, data: Dict) -> requests.Response:
        """Make POST request with API key.

        Raises:
            requests.HTTPError: on non-2xx responses.
        """
        url = f"{self.base_url}{endpoint}"
        logger.debug(f"POST {url} with {data}")

        # Fix: pass the key as a query parameter so requests URL-encodes
        # it (raw f-string interpolation broke for keys containing
        # reserved characters such as '&' or '#').
        response = self._session.post(url, params={"key": self.api_key},
                                      json=data, timeout=self.timeout)
        response.raise_for_status()
        return response

    def capture_image(self, resolution: str = "VGA", quality: int = 12) -> bytes:
        """
        Capture image from robot camera

        Args:
            resolution: QVGA, VGA, SVGA, XGA, SXGA, UXGA
            quality: 10-63 (lower = better)

        Returns:
            JPEG image data as bytes
        """
        params = {
            "resolution": resolution,
            "quality": quality
        }
        response = self._get("/api/capture", params)
        logger.info(f"Captured image: {len(response.content)} bytes")
        return response.content

    def capture_image_pil(self, resolution: str = "VGA", quality: int = 12) -> Image.Image:
        """Capture image and return as PIL Image"""
        image_data = self.capture_image(resolution, quality)
        return Image.open(BytesIO(image_data))

    def get_status(self) -> RobotStatus:
        """Get current robot status from sensors.

        Missing fields in the JSON fall back to safe defaults so a
        partially updated firmware cannot crash the bridge.
        """
        response = self._get("/api/status")
        data = response.json()

        return RobotStatus(
            distance_cm=data.get("distance_cm", 0),
            battery_percent=data.get("battery_percent", 100),
            current_action=data.get("current_action", "unknown"),
            wifi_rssi=data.get("wifi_rssi", 0),
            uptime_seconds=data.get("uptime_seconds", 0),
            servo_pan=data.get("servo_pan", 90),
            servo_tilt=data.get("servo_tilt", 90),
            obstacle_warning=data.get("obstacle_warning", False),
            obstacle_danger=data.get("obstacle_danger", False),
            is_tilted=data.get("is_tilted", False),
            is_moving=data.get("is_moving", False),
            imu=data.get("imu", {})
        )

    def send_command(self, action: str, speed: int = 50, duration_ms: int = 500,
                     pan: Optional[int] = None, tilt: Optional[int] = None) -> Dict[str, Any]:
        """
        Send movement command to robot

        Args:
            action: forward, backward, left, right, stop,
                    look_left, look_right, look_up, look_down, look_center, look_custom
            speed: 0-100 percent
            duration_ms: Duration in milliseconds
            pan: Custom pan angle (for look_custom)
            tilt: Custom tilt angle (for look_custom)

        Returns:
            Response from robot
        """
        data = {
            "action": action,
            "speed": speed,
            "duration_ms": duration_ms
        }

        # Pan/tilt are only meaningful for look_custom; omit otherwise.
        if pan is not None:
            data["pan"] = pan
        if tilt is not None:
            data["tilt"] = tilt

        response = self._post("/api/command", data)
        result = response.json()
        logger.info(f"Command {action}: {result.get('message', 'OK')}")
        return result

    # Convenience methods for common actions
    def forward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
        return self.send_command("forward", speed, duration_ms)

    def backward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
        return self.send_command("backward", speed, duration_ms)

    def left(self, speed: int = 50, duration_ms: int = 500) -> Dict:
        return self.send_command("left", speed, duration_ms)

    def right(self, speed: int = 50, duration_ms: int = 500) -> Dict:
        return self.send_command("right", speed, duration_ms)

    def stop(self) -> Dict:
        return self.send_command("stop")

    def look_left(self) -> Dict:
        return self.send_command("look_left")

    def look_right(self) -> Dict:
        return self.send_command("look_right")

    def look_up(self) -> Dict:
        return self.send_command("look_up")

    def look_down(self) -> Dict:
        return self.send_command("look_down")

    def look_center(self) -> Dict:
        return self.send_command("look_center")

    def look_custom(self, pan: int, tilt: int) -> Dict:
        return self.send_command("look_custom", pan=pan, tilt=tilt)

    def set_claude_text(self, text: str) -> Dict:
        """Set text that Claude wants to say/display"""
        response = self._post("/api/claude_text", {"text": text})
        return response.json()

    def get_claude_text(self) -> Dict[str, Any]:
        """Get last Claude text (for TTS)"""
        response = self._get("/api/claude_text")
        return response.json()

    def set_display(self, mode: str, content: str = "") -> Dict:
        """
        Control robot display

        Args:
            mode: "text", "emoji", "status"
            content: Text to show or emoji name (happy, thinking, surprised, sleepy, curious, confused)
        """
        response = self._post("/api/display", {"mode": mode, "content": content})
        return response.json()

    def is_connected(self) -> bool:
        """Check if robot is reachable (one status request, errors logged)."""
        try:
            self.get_status()
            return True
        except Exception as e:
            logger.warning(f"Connection check failed: {e}")
            return False
|
||||
|
||||
|
||||
# Test when run directly: connects to the robot IP given on the command
# line, prints a status summary and saves one test capture to disk.
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.DEBUG)

    if len(sys.argv) < 2:
        print("Usage: python esp32_client.py <robot_ip>")
        sys.exit(1)

    host = sys.argv[1]
    # NOTE(review): hard-coded shared secret; must match the firmware's key
    api_key = "claudes_eyes_secret_2025"

    client = ESP32Client(host, api_key=api_key)

    print(f"Connecting to {host}...")
    if client.is_connected():
        print("Connected!")

        status = client.get_status()
        print(f"\nStatus:")
        print(f" Distance: {status.distance_cm} cm")
        print(f" Battery: {status.battery_percent}%")
        print(f" Action: {status.current_action}")
        print(f" WiFi RSSI: {status.wifi_rssi} dBm")

        print("\nCapturing image...")
        img = client.capture_image_pil()
        print(f" Size: {img.size}")
        img.save("test_capture.jpg")
        print(" Saved to test_capture.jpg")
    else:
        print("Could not connect to robot!")
|
||||
@@ -0,0 +1,41 @@
|
||||
# Claude's Eyes - Python Bridge Dependencies
|
||||
# Install with: pip install -r requirements.txt
|
||||
|
||||
# HTTP requests to ESP32
|
||||
requests>=2.31.0
|
||||
|
||||
# Configuration
|
||||
pyyaml>=6.0.1
|
||||
|
||||
# Text-to-Speech
|
||||
pyttsx3>=2.90
|
||||
# Alternative: gTTS for Google TTS
|
||||
gTTS>=2.4.0
|
||||
|
||||
# Speech-to-Text
|
||||
SpeechRecognition>=3.10.0
|
||||
# PyAudio for microphone access (may need special install on Windows)
|
||||
# Windows: pip install pipwin && pipwin install pyaudio
|
||||
# Linux: sudo apt install python3-pyaudio
|
||||
PyAudio>=0.2.13
|
||||
|
||||
# Browser automation for Claude chat
|
||||
selenium>=4.16.0
|
||||
webdriver-manager>=4.0.1
|
||||
|
||||
# Image handling
|
||||
Pillow>=10.2.0
|
||||
|
||||
# Audio playback
|
||||
pygame>=2.5.2
|
||||
|
||||
# Async support
|
||||
aiohttp>=3.9.0
|
||||
asyncio-throttle>=1.0.2
|
||||
|
||||
# CLI interface
|
||||
rich>=13.7.0
|
||||
click>=8.1.7
|
||||
|
||||
# Optional: Claude API direct access (alternative to browser)
|
||||
anthropic>=0.39.0
|
||||
@@ -0,0 +1,216 @@
|
||||
"""
|
||||
Claude's Eyes - Speech-to-Text Engine
|
||||
|
||||
Converts Stefan's speech to text for Claude
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import queue
|
||||
from typing import Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SpeechResult:
    """Result from speech recognition"""
    # Recognized transcript text.
    text: str
    # Confidence in [0, 1]; in this module it is a fixed per-backend
    # placeholder (0.9 for google, 0.7 for sphinx), not a true
    # per-utterance score from the recognizer.
    confidence: float
    # True when this is a final (not partial/streaming) result.
    is_final: bool
||||
class STTEngine:
    """Speech-to-Text engine using SpeechRecognition library.

    Supports one-shot blocking recognition (listen_once) and a
    continuous background mode (start_continuous) that delivers
    recognized phrases both through a callback and an internal
    result queue (get_result_nonblocking).
    """

    def __init__(
        self,
        energy_threshold: int = 300,
        pause_threshold: float = 0.8,
        phrase_time_limit: int = 15,
        service: str = "google",
        language: str = "de-DE"
    ):
        """
        Args:
            energy_threshold: Minimum audio energy treated as speech.
            pause_threshold: Seconds of silence that end a phrase.
            phrase_time_limit: Maximum length of one phrase, in seconds.
            service: "google" (online) or "sphinx" (offline, needs pocketsphinx).
            language: Recognition language tag, e.g. "de-DE".
        """
        # Imported lazily so the module can be imported without the
        # optional speech_recognition dependency installed.
        import speech_recognition as sr

        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()

        # Configure recognizer
        self.recognizer.energy_threshold = energy_threshold
        self.recognizer.pause_threshold = pause_threshold
        # FIX: phrase_time_limit is a parameter of Recognizer.listen(),
        # not a Recognizer attribute. Keep it on the engine and pass it
        # explicitly at every listen() call (the original set a dead
        # attribute and never applied it in listen_once()).
        self.phrase_time_limit = phrase_time_limit

        self.service = service
        self.language = language

        self._listening = False
        self._callback: Optional[Callable[[SpeechResult], None]] = None
        self._stop_flag = False
        self._thread: Optional[threading.Thread] = None
        self._results_queue = queue.Queue()

        # Calibrate microphone (blocks for ~1 second of ambient noise).
        logger.info("Calibrating microphone...")
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
        logger.info(f"Energy threshold set to {self.recognizer.energy_threshold}")

        logger.info(f"STT engine initialized (service: {service}, language: {language})")

    def listen_once(self, timeout: Optional[float] = None) -> Optional[SpeechResult]:
        """
        Listen for a single phrase (blocking)

        Args:
            timeout: Maximum time to wait for speech start

        Returns:
            SpeechResult or None if nothing recognized
        """
        import speech_recognition as sr

        try:
            with self.microphone as source:
                logger.debug("Listening...")
                # FIX: honor phrase_time_limit for one-shot listens too,
                # so a very long utterance cannot block indefinitely.
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=self.phrase_time_limit,
                )

            return self._recognize(audio)

        except sr.WaitTimeoutError:
            logger.debug("Listen timeout")
            return None
        except Exception as e:
            logger.error(f"Listen error: {e}")
            return None

    def _recognize(self, audio) -> Optional[SpeechResult]:
        """Recognize speech from audio data.

        Returns a SpeechResult, or None when the audio could not be
        understood or the backend service failed.
        """
        import speech_recognition as sr

        try:
            if self.service == "google":
                text = self.recognizer.recognize_google(audio, language=self.language)
                # The free Google endpoint reports no confidence score;
                # 0.9 is a fixed optimistic placeholder.
                return SpeechResult(text=text, confidence=0.9, is_final=True)

            elif self.service == "sphinx":
                # Offline recognition (needs pocketsphinx)
                return SpeechResult(
                    text=self.recognizer.recognize_sphinx(audio),
                    confidence=0.7,
                    is_final=True,
                )

            else:
                logger.error(f"Unknown service: {self.service}")
                return None

        except sr.UnknownValueError:
            logger.debug("Could not understand audio")
            return None
        except sr.RequestError as e:
            logger.error(f"Recognition service error: {e}")
            return None

    def start_continuous(self, callback: Callable[[SpeechResult], None]) -> None:
        """
        Start continuous listening in background

        Args:
            callback: Function called with each recognized phrase
        """
        if self._listening:
            logger.warning("Already listening")
            return

        self._callback = callback
        self._stop_flag = False
        self._listening = True

        self._thread = threading.Thread(target=self._listen_loop, daemon=True)
        self._thread.start()

        logger.info("Continuous listening started")

    def stop_continuous(self) -> None:
        """Stop continuous listening (waits up to 2 s for the thread)."""
        self._stop_flag = True
        self._listening = False

        if self._thread:
            self._thread.join(timeout=2)
            self._thread = None

        logger.info("Continuous listening stopped")

    def _listen_loop(self):
        """Background thread for continuous listening."""
        import speech_recognition as sr

        while not self._stop_flag:
            try:
                with self.microphone as source:
                    # Short timeout to allow stop checks
                    try:
                        audio = self.recognizer.listen(
                            source,
                            timeout=1,
                            phrase_time_limit=self.phrase_time_limit,
                        )
                    except sr.WaitTimeoutError:
                        continue

                result = self._recognize(audio)
                if result:
                    # FIX: also enqueue the result so
                    # get_result_nonblocking() actually yields data —
                    # the queue was never filled in the original.
                    self._results_queue.put(result)
                    if self._callback:
                        self._callback(result)

            except Exception as e:
                # Suppress errors raised during shutdown.
                if not self._stop_flag:
                    logger.error(f"Listen loop error: {e}")

    def is_listening(self) -> bool:
        """Return True while the background listener is active."""
        return self._listening

    def get_result_nonblocking(self) -> Optional[SpeechResult]:
        """Get result without blocking (for use with async callback)"""
        try:
            return self._results_queue.get_nowait()
        except queue.Empty:
            return None
def create_stt_engine(**kwargs) -> STTEngine:
    """Factory function to create STT engine.

    Recognized keys (others are ignored): energy_threshold,
    pause_threshold, phrase_time_limit, service, language.
    """
    defaults = {
        "energy_threshold": 300,
        "pause_threshold": 0.8,
        "phrase_time_limit": 15,
        "service": "google",
        "language": "de-DE",
    }
    options = {key: kwargs.get(key, fallback) for key, fallback in defaults.items()}
    return STTEngine(**options)
# Test when run directly: interactive microphone smoke test.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    print("Speech-to-Text Test")
    print("=" * 40)

    engine = create_stt_engine(language="de-DE")

    # One-shot recognition: speak a single phrase within 10 seconds.
    print("\nSag etwas (du hast 10 Sekunden)...")
    result = engine.listen_once(timeout=10)

    if result:
        print(f"\nErkannt: '{result.text}'")
        print(f"Konfidenz: {result.confidence:.0%}")
    else:
        print("\nNichts erkannt.")

    # Continuous mode: echo every recognized phrase for 5 seconds.
    print("\n\nKontinuierlicher Modus (5 Sekunden)...")

    def on_speech(result: SpeechResult):
        print(f" -> {result.text}")

    engine.start_continuous(on_speech)

    import time
    time.sleep(5)

    engine.stop_continuous()
    print("\nDone!")
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
Claude's Eyes - Text-to-Speech Engine
|
||||
|
||||
Converts Claude's text responses to spoken audio
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import queue
|
||||
from typing import Optional
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TTSEngine(ABC):
    """Abstract base class for TTS engines.

    Concrete engines must provide blocking and non-blocking speech
    output, a way to cancel playback, and a speaking-state query.
    """

    @abstractmethod
    def speak(self, text: str) -> None:
        """Speak the given text, returning only once output is done."""
        ...

    @abstractmethod
    def speak_async(self, text: str) -> None:
        """Queue the given text for speech and return immediately."""
        ...

    @abstractmethod
    def stop(self) -> None:
        """Cancel any speech currently in progress."""
        ...

    @abstractmethod
    def is_speaking(self) -> bool:
        """Return True while speech output is in progress."""
        ...
class Pyttsx3Engine(TTSEngine):
    """TTS using pyttsx3 (offline, system voices)"""

    def __init__(self, voice: Optional[str] = None, rate: int = 150, volume: float = 0.9):
        # Lazy import: pyttsx3 is an optional dependency.
        import pyttsx3

        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', rate)
        self.engine.setProperty('volume', volume)

        # Set voice if specified: first system voice whose name contains
        # the requested substring (case-insensitive) wins.
        if voice:
            voices = self.engine.getProperty('voices')
            for v in voices:
                if voice.lower() in v.name.lower():
                    self.engine.setProperty('voice', v.id)
                    break

        self._speaking = False  # True while speak() is running
        self._queue = queue.Queue()  # pending texts for speak_async()
        self._thread: Optional[threading.Thread] = None  # background worker
        self._stop_flag = False  # signals the worker loop to exit

        logger.info("Pyttsx3 TTS engine initialized")

    def speak(self, text: str) -> None:
        """Speak text (blocking)"""
        self._speaking = True
        try:
            self.engine.say(text)
            self.engine.runAndWait()
        finally:
            # Always clear the flag, even if pyttsx3 raises.
            self._speaking = False

    def speak_async(self, text: str) -> None:
        """Speak text (non-blocking): enqueue it for the worker thread."""
        self._queue.put(text)

        # Start (or restart after stop()) the worker thread on demand.
        if self._thread is None or not self._thread.is_alive():
            self._stop_flag = False
            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
            self._thread.start()

    def _speech_worker(self):
        """Worker thread for async speech"""
        while not self._stop_flag:
            try:
                # Short timeout so the stop flag is re-checked regularly.
                text = self._queue.get(timeout=0.5)
                self.speak(text)
                self._queue.task_done()
            except queue.Empty:
                continue

    def stop(self) -> None:
        """Stop current speech"""
        # Ends the worker loop; speak_async() will restart it if needed.
        self._stop_flag = True
        self.engine.stop()
        # Clear queue
        while not self._queue.empty():
            try:
                self._queue.get_nowait()
            except queue.Empty:
                break

    def is_speaking(self) -> bool:
        # NOTE(review): plain cross-thread bool read — a status hint,
        # not a synchronization primitive.
        return self._speaking
class GTTSEngine(TTSEngine):
    """TTS using Google Text-to-Speech (online, better quality).

    Synthesizes speech via the gTTS web API into a temporary MP3 and
    plays it back with pygame's mixer.
    """

    def __init__(self, language: str = "de"):
        """
        Args:
            language: gTTS language code, e.g. "de" or "en".
        """
        from gtts import gTTS
        import pygame

        pygame.mixer.init()

        self.language = language
        self._speaking = False  # True while speak() is running
        self._queue = queue.Queue()  # pending texts for speak_async()
        self._thread: Optional[threading.Thread] = None  # background worker
        self._stop_flag = False  # signals the worker loop to exit

        logger.info(f"gTTS engine initialized (language: {language})")

    def speak(self, text: str) -> None:
        """Speak text (blocking).

        Synthesizes *text* to a temporary MP3 file and blocks until
        pygame finishes playing it. The temp file is always removed.
        """
        from gtts import gTTS
        import pygame
        import tempfile
        import os

        self._speaking = True
        temp_path = None
        try:
            # Generate audio file
            tts = gTTS(text=text, lang=self.language)

            # FIX: close the temp file before gTTS writes to it —
            # writing to a still-open NamedTemporaryFile path fails on
            # Windows (the original saved inside the `with` block).
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
                temp_path = f.name
            tts.save(temp_path)

            # Play audio
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()

            # Wait for playback to finish
            while pygame.mixer.music.get_busy():
                pygame.time.Clock().tick(10)

        except Exception as e:
            logger.error(f"gTTS error: {e}")
        finally:
            # FIX: always clean up the temp file, even when synthesis or
            # playback fails (the original only unlinked on success and
            # leaked one MP3 per error).
            if temp_path is not None:
                try:
                    # Release pygame's handle first so the unlink also
                    # succeeds on Windows (unload: pygame >= 2.0).
                    pygame.mixer.music.unload()
                except Exception:
                    pass
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
            self._speaking = False

    def speak_async(self, text: str) -> None:
        """Speak text (non-blocking): enqueue it for the worker thread."""
        self._queue.put(text)

        # Start (or restart after stop()) the worker thread on demand.
        if self._thread is None or not self._thread.is_alive():
            self._stop_flag = False
            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
            self._thread.start()

    def _speech_worker(self):
        """Worker thread for async speech."""
        while not self._stop_flag:
            try:
                # Short timeout so the stop flag is re-checked regularly.
                text = self._queue.get(timeout=0.5)
                self.speak(text)
                self._queue.task_done()
            except queue.Empty:
                continue

    def stop(self) -> None:
        """Stop current speech and drop any queued text."""
        import pygame
        self._stop_flag = True
        pygame.mixer.music.stop()
        # Clear queue
        while not self._queue.empty():
            try:
                self._queue.get_nowait()
            except queue.Empty:
                break

    def is_speaking(self) -> bool:
        """Return True while speak() is synthesizing/playing audio."""
        return self._speaking
def create_tts_engine(engine_type: str = "pyttsx3", **kwargs) -> TTSEngine:
    """
    Factory function to create TTS engine

    Args:
        engine_type: "pyttsx3" or "gtts"
        **kwargs: Engine-specific options
    """
    if engine_type == "gtts":
        return GTTSEngine(language=kwargs.get("language", "de"))
    if engine_type == "pyttsx3":
        return Pyttsx3Engine(
            voice=kwargs.get("voice"),
            rate=kwargs.get("rate", 150),
            volume=kwargs.get("volume", 0.9),
        )
    raise ValueError(f"Unknown TTS engine: {engine_type}")
# Test when run directly: audible smoke test of both engines.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    # Offline engine first — uses system voices, no network needed.
    print("Testing pyttsx3...")
    engine = create_tts_engine("pyttsx3", rate=150)
    engine.speak("Hallo! Ich bin Claude und erkunde gerade deine Wohnung.")

    # Online engine — requires network access, so failures are tolerated.
    print("\nTesting gTTS...")
    try:
        engine2 = create_tts_engine("gtts", language="de")
        engine2.speak("Das hier klingt noch besser!")
    except Exception as e:
        print(f"gTTS not available: {e}")

    print("\nDone!")
Reference in New Issue
Block a user