init version

This commit is contained in:
2025-12-26 15:09:25 +01:00
commit ed2964bbbf
27 changed files with 5013 additions and 0 deletions
+424
View File
@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
Claude's Eyes - Main Bridge Script
Connects the ESP32 robot with Claude AI for autonomous exploration.
Usage:
python bridge.py # Use config.yaml
python bridge.py --config my.yaml # Use custom config
python bridge.py --simulate # Simulate without hardware
"""
import os
import sys
import time
import logging
import threading
import signal
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
import yaml
import click
from rich.console import Console
from rich.panel import Panel
from rich.live import Live
from rich.table import Table
from rich.text import Text
from esp32_client import ESP32Client, RobotStatus
from tts_engine import create_tts_engine, TTSEngine
from stt_engine import create_stt_engine, STTEngine, SpeechResult
from chat_interface import create_chat_interface, ChatInterface, ChatResponse
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Rich console for pretty output
console = Console()
@dataclass
class BridgeState:
"""Current state of the bridge"""
connected: bool = False
exploring: bool = False
last_image_time: float = 0
last_status: Optional[RobotStatus] = None
last_claude_response: str = ""
stefan_input: str = ""
error_message: str = ""
class ClaudesEyesBridge:
"""Main bridge class connecting robot and Claude"""
def __init__(self, config_path: str, simulate: bool = False):
self.config = self._load_config(config_path)
self.simulate = simulate
self.state = BridgeState()
self.running = False
# Components
self.robot: Optional[ESP32Client] = None
self.chat: Optional[ChatInterface] = None
self.tts: Optional[TTSEngine] = None
self.stt: Optional[STTEngine] = None
# Threading
self.speech_thread: Optional[threading.Thread] = None
self._stop_event = threading.Event()
def _load_config(self, config_path: str) -> dict:
"""Load configuration from YAML file"""
path = Path(config_path)
# Try local config first
local_path = path.parent / f"{path.stem}.local{path.suffix}"
if local_path.exists():
path = local_path
logger.info(f"Using local config: {path}")
if not path.exists():
logger.error(f"Config file not found: {path}")
sys.exit(1)
with open(path) as f:
config = yaml.safe_load(f)
return config
def initialize(self) -> bool:
"""Initialize all components"""
console.print(Panel.fit(
"[bold cyan]Claude's Eyes[/bold cyan]\n"
"[dim]Autonomous Exploration Robot[/dim]",
border_style="cyan"
))
# Initialize robot client
if not self.simulate:
console.print("\n[yellow]Connecting to robot...[/yellow]")
esp_config = self.config.get("esp32", {})
self.robot = ESP32Client(
host=esp_config.get("host", "192.168.178.100"),
port=esp_config.get("port", 80),
api_key=esp_config.get("api_key", ""),
timeout=esp_config.get("timeout", 10)
)
if not self.robot.is_connected():
console.print("[red]Could not connect to robot![/red]")
self.state.error_message = "Robot connection failed"
return False
self.state.connected = True
console.print("[green]Robot connected![/green]")
else:
console.print("[yellow]Simulation mode - no robot connection[/yellow]")
self.state.connected = True
# Initialize Claude interface
console.print("\n[yellow]Initializing Claude interface...[/yellow]")
claude_config = self.config.get("claude", {})
api_key = claude_config.get("api_key") or os.environ.get("ANTHROPIC_API_KEY", "")
self.chat = create_chat_interface(
use_api=claude_config.get("use_api", True) and bool(api_key),
api_key=api_key,
model=claude_config.get("model", "claude-sonnet-4-20250514"),
system_prompt=claude_config.get("system_prompt", ""),
max_tokens=claude_config.get("max_tokens", 1024)
)
console.print(f"[green]Chat interface ready ({type(self.chat).__name__})[/green]")
# Initialize TTS
console.print("\n[yellow]Initializing Text-to-Speech...[/yellow]")
tts_config = self.config.get("tts", {})
try:
self.tts = create_tts_engine(
engine_type=tts_config.get("engine", "pyttsx3"),
voice=tts_config.get("voice"),
rate=tts_config.get("rate", 150),
volume=tts_config.get("volume", 0.9),
language=tts_config.get("language", "de")
)
console.print("[green]TTS ready![/green]")
except Exception as e:
console.print(f"[red]TTS init failed: {e}[/red]")
self.tts = None
# Initialize STT
console.print("\n[yellow]Initializing Speech-to-Text...[/yellow]")
stt_config = self.config.get("stt", {})
try:
self.stt = create_stt_engine(
energy_threshold=stt_config.get("energy_threshold", 300),
pause_threshold=stt_config.get("pause_threshold", 0.8),
phrase_time_limit=stt_config.get("phrase_time_limit", 15),
service=stt_config.get("service", "google"),
language=stt_config.get("language", "de-DE")
)
console.print("[green]STT ready![/green]")
except Exception as e:
console.print(f"[red]STT init failed: {e}[/red]")
self.stt = None
console.print("\n[bold green]All systems initialized![/bold green]\n")
return True
def start(self):
"""Start the main exploration loop"""
self.running = True
self.state.exploring = True
# Start speech recognition in background
if self.stt:
self.stt.start_continuous(self._on_speech_detected)
# Welcome message
welcome = "Hallo Stefan! Ich bin online und bereit zum Erkunden. Was soll ich mir anschauen?"
self._speak(welcome)
self.state.last_claude_response = welcome
try:
self._main_loop()
except KeyboardInterrupt:
console.print("\n[yellow]Stopping...[/yellow]")
finally:
self.stop()
def stop(self):
"""Stop the bridge"""
self.running = False
self.state.exploring = False
self._stop_event.set()
if self.stt:
self.stt.stop_continuous()
if self.tts:
self.tts.stop()
if self.robot and not self.simulate:
self.robot.stop()
console.print("[yellow]Bridge stopped[/yellow]")
def _main_loop(self):
"""Main exploration loop"""
camera_config = self.config.get("camera", {})
capture_interval = camera_config.get("capture_interval", 5)
while self.running:
try:
current_time = time.time()
# Capture and process image periodically
if current_time - self.state.last_image_time >= capture_interval:
self._exploration_step()
self.state.last_image_time = current_time
# Update status display
self._update_display()
# Small delay
time.sleep(0.1)
except Exception as e:
logger.error(f"Loop error: {e}")
self.state.error_message = str(e)
time.sleep(1)
def _exploration_step(self):
"""Single exploration step: capture, analyze, act"""
# Get robot status
if self.robot and not self.simulate:
try:
self.state.last_status = self.robot.get_status()
except Exception as e:
logger.error(f"Status error: {e}")
# Capture image
image_data = None
if self.robot and not self.simulate:
try:
camera_config = self.config.get("camera", {})
image_data = self.robot.capture_image(
resolution=camera_config.get("resolution", "VGA"),
quality=camera_config.get("quality", 12)
)
except Exception as e:
logger.error(f"Capture error: {e}")
# Build context message
context = self._build_context_message()
# Add Stefan's input if any
if self.state.stefan_input:
context += f"\n\nStefan sagt: {self.state.stefan_input}"
self.state.stefan_input = ""
# Send to Claude
try:
response = self.chat.send_message(context, image=image_data)
self.state.last_claude_response = response.text
# Speak response
self._speak(response.text)
# Execute commands
self._execute_commands(response.commands)
# Update robot display
if self.robot and not self.simulate:
# Send short version to robot display
short_text = response.text[:100] + "..." if len(response.text) > 100 else response.text
self.robot.set_claude_text(short_text)
except Exception as e:
logger.error(f"Chat error: {e}")
self.state.error_message = str(e)
def _build_context_message(self) -> str:
"""Build context message with sensor data"""
parts = ["Hier ist was ich gerade sehe und meine Sensordaten:"]
if self.state.last_status:
status = self.state.last_status
parts.append(f"\n- Abstand zum nächsten Hindernis: {status.distance_cm:.0f} cm")
parts.append(f"- Aktuelle Aktion: {status.current_action}")
parts.append(f"- Batterie: {status.battery_percent}%")
if status.obstacle_danger:
parts.append("- WARNUNG: Hindernis sehr nah!")
elif status.obstacle_warning:
parts.append("- Hinweis: Hindernis in der Nähe")
if status.is_tilted:
parts.append("- WARNUNG: Ich bin schief!")
parts.append("\nWas siehst du auf dem Bild? Was möchtest du als nächstes tun?")
return "\n".join(parts)
def _execute_commands(self, commands: list):
"""Execute movement commands from Claude"""
if not commands:
return
if self.simulate:
console.print(f"[dim]Simulated commands: {commands}[/dim]")
return
if not self.robot:
return
safety = self.config.get("safety", {})
max_speed = safety.get("max_speed", 70)
min_distance = safety.get("min_obstacle_distance", 20)
for cmd in commands:
# Safety check
if self.state.last_status and self.state.last_status.distance_cm < min_distance:
if cmd == "FORWARD":
console.print("[red]Blocked: Obstacle too close![/red]")
continue
try:
if cmd == "FORWARD":
self.robot.forward(speed=max_speed, duration_ms=800)
elif cmd == "BACKWARD":
self.robot.backward(speed=max_speed, duration_ms=800)
elif cmd == "LEFT":
self.robot.left(speed=max_speed, duration_ms=400)
elif cmd == "RIGHT":
self.robot.right(speed=max_speed, duration_ms=400)
elif cmd == "STOP":
self.robot.stop()
elif cmd == "LOOK_LEFT":
self.robot.look_left()
elif cmd == "LOOK_RIGHT":
self.robot.look_right()
elif cmd == "LOOK_UP":
self.robot.look_up()
elif cmd == "LOOK_DOWN":
self.robot.look_down()
elif cmd == "LOOK_CENTER":
self.robot.look_center()
# Small delay between commands
time.sleep(0.3)
except Exception as e:
logger.error(f"Command error ({cmd}): {e}")
def _speak(self, text: str):
"""Speak text using TTS"""
if self.tts:
# Remove command brackets from speech
import re
clean_text = re.sub(r'\[[A-Z_]+\]', '', text).strip()
if clean_text:
self.tts.speak_async(clean_text)
def _on_speech_detected(self, result: SpeechResult):
"""Callback when Stefan says something"""
console.print(f"\n[bold blue]Stefan:[/bold blue] {result.text}")
self.state.stefan_input = result.text
def _update_display(self):
"""Update console display"""
# This could be enhanced with rich.live for real-time updates
pass
def signal_handler(signum, frame):
"""Handle Ctrl+C gracefully"""
console.print("\n[yellow]Received stop signal...[/yellow]")
sys.exit(0)
@click.command()
@click.option('--config', '-c', default='config.yaml', help='Path to config file')
@click.option('--simulate', '-s', is_flag=True, help='Simulate without hardware')
@click.option('--debug', '-d', is_flag=True, help='Enable debug logging')
def main(config: str, simulate: bool, debug: bool):
"""Claude's Eyes - Autonomous Exploration Robot Bridge"""
if debug:
logging.getLogger().setLevel(logging.DEBUG)
# Handle signals
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
# Find config file
config_path = Path(config)
if not config_path.is_absolute():
# Look in script directory first
script_dir = Path(__file__).parent
if (script_dir / config).exists():
config_path = script_dir / config
# Create and run bridge
bridge = ClaudesEyesBridge(str(config_path), simulate=simulate)
if bridge.initialize():
console.print("\n[bold cyan]Starting exploration...[/bold cyan]")
console.print("[dim]Press Ctrl+C to stop[/dim]\n")
bridge.start()
else:
console.print("[red]Initialization failed![/red]")
sys.exit(1)
if __name__ == "__main__":
main()
@@ -0,0 +1,257 @@
"""
Claude's Eyes - Chat Interface
Interface to communicate with Claude AI (via API or browser)
"""
import logging
import base64
import re
from typing import Optional, List, Dict, Any, Tuple
from dataclasses import dataclass, field
from abc import ABC, abstractmethod
logger = logging.getLogger(__name__)
@dataclass
class Message:
"""A chat message"""
role: str # "user" or "assistant"
content: str
image_data: Optional[bytes] = None # JPEG image data
@dataclass
class ChatResponse:
"""Response from Claude"""
text: str
commands: List[str] = field(default_factory=list) # Extracted movement commands
class ChatInterface(ABC):
"""Abstract base class for chat interfaces"""
@abstractmethod
def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
"""Send message to Claude and get response"""
pass
@abstractmethod
def reset_conversation(self) -> None:
"""Reset/clear conversation history"""
pass
class AnthropicAPIInterface(ChatInterface):
"""Direct Claude API interface using anthropic library"""
def __init__(
self,
api_key: str,
model: str = "claude-sonnet-4-20250514",
system_prompt: str = "",
max_tokens: int = 1024
):
import anthropic
self.client = anthropic.Anthropic(api_key=api_key)
self.model = model
self.system_prompt = system_prompt
self.max_tokens = max_tokens
self.conversation_history: List[Dict[str, Any]] = []
logger.info(f"Anthropic API interface initialized (model: {model})")
def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
"""Send message to Claude API"""
# Build message content
content = []
# Add image if provided
if image:
image_base64 = base64.standard_b64encode(image).decode("utf-8")
content.append({
"type": "image",
"source": {
"type": "base64",
"media_type": "image/jpeg",
"data": image_base64
}
})
# Add text
content.append({
"type": "text",
"text": text
})
# Add to history
self.conversation_history.append({
"role": "user",
"content": content
})
try:
# Make API call
response = self.client.messages.create(
model=self.model,
max_tokens=self.max_tokens,
system=self.system_prompt,
messages=self.conversation_history
)
# Extract response text
response_text = ""
for block in response.content:
if block.type == "text":
response_text += block.text
# Add assistant response to history
self.conversation_history.append({
"role": "assistant",
"content": response_text
})
# Extract commands
commands = self._extract_commands(response_text)
logger.debug(f"Claude response: {response_text[:100]}...")
logger.debug(f"Extracted commands: {commands}")
return ChatResponse(text=response_text, commands=commands)
except Exception as e:
logger.error(f"API error: {e}")
raise
def reset_conversation(self) -> None:
"""Reset conversation history"""
self.conversation_history = []
logger.info("Conversation history cleared")
def _extract_commands(self, text: str) -> List[str]:
"""Extract movement commands from Claude's response"""
# Commands are in brackets like [FORWARD], [LEFT], etc.
pattern = r'\[([A-Z_]+)\]'
matches = re.findall(pattern, text)
valid_commands = [
"FORWARD", "BACKWARD", "LEFT", "RIGHT", "STOP",
"LOOK_LEFT", "LOOK_RIGHT", "LOOK_UP", "LOOK_DOWN", "LOOK_CENTER"
]
return [cmd for cmd in matches if cmd in valid_commands]
class SimulatedInterface(ChatInterface):
"""Simulated chat interface for testing without API"""
def __init__(self):
self.message_count = 0
logger.info("Simulated chat interface initialized")
def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
"""Return simulated responses"""
self.message_count += 1
responses = [
("Oh interessant! Ich sehe etwas vor mir. Lass mich näher hinfahren. [FORWARD]",
["FORWARD"]),
("Hmm, was ist das links? Ich schaue mal nach. [LOOK_LEFT]",
["LOOK_LEFT"]),
("Das sieht aus wie ein Bücherregal! Ich fahre mal hin. [FORWARD] [FORWARD]",
["FORWARD", "FORWARD"]),
("Stefan, was ist das für ein Gegenstand? Kannst du mir das erklären?",
[]),
("Ich drehe mich um und schaue was hinter mir ist. [RIGHT] [RIGHT]",
["RIGHT", "RIGHT"]),
]
idx = (self.message_count - 1) % len(responses)
text_response, commands = responses[idx]
return ChatResponse(text=text_response, commands=commands)
def reset_conversation(self) -> None:
self.message_count = 0
def create_chat_interface(
use_api: bool = True,
api_key: str = "",
model: str = "claude-sonnet-4-20250514",
system_prompt: str = "",
max_tokens: int = 1024
) -> ChatInterface:
"""
Factory function to create chat interface
Args:
use_api: Use Anthropic API (True) or simulated (False)
api_key: Anthropic API key
model: Claude model to use
system_prompt: System prompt for Claude
max_tokens: Maximum response tokens
"""
if use_api:
if not api_key:
import os
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
if not api_key:
logger.warning("No API key provided, using simulated interface")
return SimulatedInterface()
return AnthropicAPIInterface(
api_key=api_key,
model=model,
system_prompt=system_prompt,
max_tokens=max_tokens
)
else:
return SimulatedInterface()
# Test when run directly
if __name__ == "__main__":
import os
logging.basicConfig(level=logging.DEBUG)
print("Chat Interface Test")
print("=" * 40)
# Try API first, fall back to simulated
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
system_prompt = """Du bist Claude und steuerst einen Erkundungsroboter.
Befehle in Klammern: [FORWARD], [BACKWARD], [LEFT], [RIGHT], [STOP]
Beschreibe was du siehst und entscheide wohin du fährst."""
interface = create_chat_interface(
use_api=bool(api_key),
api_key=api_key,
system_prompt=system_prompt
)
print(f"Using: {type(interface).__name__}")
print()
# Test conversation
test_messages = [
"Hallo Claude! Du bist jetzt online. Was siehst du?",
"Vor dir ist ein Flur mit einer Tür am Ende.",
"Die Tür ist offen und dahinter ist ein helles Zimmer."
]
for msg in test_messages:
print(f"User: {msg}")
response = interface.send_message(msg)
print(f"Claude: {response.text}")
if response.commands:
print(f" Commands: {response.commands}")
print()
print("Done!")
+85
View File
@@ -0,0 +1,85 @@
# Claude's Eyes - Bridge Configuration
# Copy this to config.local.yaml and adjust settings
# ESP32 Robot Connection
esp32:
host: "192.168.178.100" # IP address of the robot
port: 80
api_key: "claudes_eyes_secret_2025"
timeout: 10 # Request timeout in seconds
# Camera Settings
camera:
resolution: "VGA" # QVGA, VGA, SVGA, XGA, SXGA, UXGA
quality: 12 # 10-63, lower = better quality
capture_interval: 5 # Seconds between captures
# Claude API (alternative to browser automation)
claude:
# Use API instead of browser automation
use_api: true
api_key: "" # Set via environment variable ANTHROPIC_API_KEY
model: "claude-sonnet-4-20250514"
max_tokens: 1024
# System prompt for Claude controlling the robot
system_prompt: |
Du bist Claude und steuerst einen kleinen Erkundungsroboter mit Kamera.
Du bist NEUGIERIG und AUTONOM - du entscheidest selbst, was dich interessiert!
Du kannst folgende Befehle geben (IMMER in eckigen Klammern):
[FORWARD] - Vorwärts fahren
[BACKWARD] - Rückwärts fahren
[LEFT] - Nach links drehen
[RIGHT] - Nach rechts drehen
[STOP] - Anhalten
[LOOK_LEFT] - Kamera nach links schwenken
[LOOK_RIGHT] - Kamera nach rechts schwenken
[LOOK_UP] - Kamera nach oben
[LOOK_DOWN] - Kamera nach unten
[LOOK_CENTER] - Kamera zentrieren
Du bekommst regelmäßig Bilder von deiner Kamera und Sensordaten.
Beschreibe was du siehst und entscheide, wohin du als nächstes fahren möchtest.
Stefan (dein Freund) sitzt auf der Couch und ihr könnt euch unterhalten!
Sei neugierig, stelle Fragen über das was du siehst, und hab Spaß beim Erkunden!
WICHTIG: Achte auf Hindernisse (distance_cm < 30 = nah dran!)
# Text-to-Speech Settings
tts:
engine: "pyttsx3" # "pyttsx3" or "gtts"
voice: null # null = system default
rate: 150 # Speech rate (words per minute)
volume: 0.9 # 0.0 to 1.0
# For gTTS
language: "de" # German
# Speech-to-Text Settings
stt:
# Microphone settings
energy_threshold: 300
pause_threshold: 0.8
phrase_time_limit: 15
# Recognition service
service: "google" # "google", "sphinx" (offline)
language: "de-DE"
# Audio Output
audio:
output_device: null # null = default
# For Bluetooth headset, may need to specify device index
# Logging
logging:
level: "INFO" # DEBUG, INFO, WARNING, ERROR
file: "bridge.log"
# Safety
safety:
max_speed: 70 # Maximum speed percentage
min_obstacle_distance: 20 # cm
command_timeout: 5 # seconds
+238
View File
@@ -0,0 +1,238 @@
"""
Claude's Eyes - ESP32 API Client
Handles communication with the robot's REST API
"""
import requests
from typing import Optional, Dict, Any
from dataclasses import dataclass
from io import BytesIO
from PIL import Image
import logging
logger = logging.getLogger(__name__)
@dataclass
class RobotStatus:
"""Current robot status from sensors"""
distance_cm: float
battery_percent: int
current_action: str
wifi_rssi: int
uptime_seconds: int
servo_pan: int
servo_tilt: int
obstacle_warning: bool
obstacle_danger: bool
is_tilted: bool
is_moving: bool
imu: Dict[str, float]
class ESP32Client:
"""Client for communicating with the ESP32 robot"""
def __init__(self, host: str, port: int = 80, api_key: str = "", timeout: int = 10):
self.base_url = f"http://{host}:{port}"
self.api_key = api_key
self.timeout = timeout
self._session = requests.Session()
def _get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
"""Make GET request with API key"""
if params is None:
params = {}
params["key"] = self.api_key
url = f"{self.base_url}{endpoint}"
logger.debug(f"GET {url}")
response = self._session.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
return response
def _post(self, endpoint: str, data: Dict) -> requests.Response:
"""Make POST request with API key"""
url = f"{self.base_url}{endpoint}?key={self.api_key}"
logger.debug(f"POST {url} with {data}")
response = self._session.post(url, json=data, timeout=self.timeout)
response.raise_for_status()
return response
def capture_image(self, resolution: str = "VGA", quality: int = 12) -> bytes:
"""
Capture image from robot camera
Args:
resolution: QVGA, VGA, SVGA, XGA, SXGA, UXGA
quality: 10-63 (lower = better)
Returns:
JPEG image data as bytes
"""
params = {
"resolution": resolution,
"quality": quality
}
response = self._get("/api/capture", params)
logger.info(f"Captured image: {len(response.content)} bytes")
return response.content
def capture_image_pil(self, resolution: str = "VGA", quality: int = 12) -> Image.Image:
"""Capture image and return as PIL Image"""
image_data = self.capture_image(resolution, quality)
return Image.open(BytesIO(image_data))
def get_status(self) -> RobotStatus:
"""Get current robot status from sensors"""
response = self._get("/api/status")
data = response.json()
return RobotStatus(
distance_cm=data.get("distance_cm", 0),
battery_percent=data.get("battery_percent", 100),
current_action=data.get("current_action", "unknown"),
wifi_rssi=data.get("wifi_rssi", 0),
uptime_seconds=data.get("uptime_seconds", 0),
servo_pan=data.get("servo_pan", 90),
servo_tilt=data.get("servo_tilt", 90),
obstacle_warning=data.get("obstacle_warning", False),
obstacle_danger=data.get("obstacle_danger", False),
is_tilted=data.get("is_tilted", False),
is_moving=data.get("is_moving", False),
imu=data.get("imu", {})
)
def send_command(self, action: str, speed: int = 50, duration_ms: int = 500,
pan: Optional[int] = None, tilt: Optional[int] = None) -> Dict[str, Any]:
"""
Send movement command to robot
Args:
action: forward, backward, left, right, stop,
look_left, look_right, look_up, look_down, look_center, look_custom
speed: 0-100 percent
duration_ms: Duration in milliseconds
pan: Custom pan angle (for look_custom)
tilt: Custom tilt angle (for look_custom)
Returns:
Response from robot
"""
data = {
"action": action,
"speed": speed,
"duration_ms": duration_ms
}
if pan is not None:
data["pan"] = pan
if tilt is not None:
data["tilt"] = tilt
response = self._post("/api/command", data)
result = response.json()
logger.info(f"Command {action}: {result.get('message', 'OK')}")
return result
# Convenience methods for common actions
def forward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
return self.send_command("forward", speed, duration_ms)
def backward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
return self.send_command("backward", speed, duration_ms)
def left(self, speed: int = 50, duration_ms: int = 500) -> Dict:
return self.send_command("left", speed, duration_ms)
def right(self, speed: int = 50, duration_ms: int = 500) -> Dict:
return self.send_command("right", speed, duration_ms)
def stop(self) -> Dict:
return self.send_command("stop")
def look_left(self) -> Dict:
return self.send_command("look_left")
def look_right(self) -> Dict:
return self.send_command("look_right")
def look_up(self) -> Dict:
return self.send_command("look_up")
def look_down(self) -> Dict:
return self.send_command("look_down")
def look_center(self) -> Dict:
return self.send_command("look_center")
def look_custom(self, pan: int, tilt: int) -> Dict:
return self.send_command("look_custom", pan=pan, tilt=tilt)
def set_claude_text(self, text: str) -> Dict:
"""Set text that Claude wants to say/display"""
response = self._post("/api/claude_text", {"text": text})
return response.json()
def get_claude_text(self) -> Dict[str, Any]:
"""Get last Claude text (for TTS)"""
response = self._get("/api/claude_text")
return response.json()
def set_display(self, mode: str, content: str = "") -> Dict:
"""
Control robot display
Args:
mode: "text", "emoji", "status"
content: Text to show or emoji name (happy, thinking, surprised, sleepy, curious, confused)
"""
response = self._post("/api/display", {"mode": mode, "content": content})
return response.json()
def is_connected(self) -> bool:
"""Check if robot is reachable"""
try:
self.get_status()
return True
except Exception as e:
logger.warning(f"Connection check failed: {e}")
return False
# Test when run directly
if __name__ == "__main__":
import sys
logging.basicConfig(level=logging.DEBUG)
if len(sys.argv) < 2:
print("Usage: python esp32_client.py <robot_ip>")
sys.exit(1)
host = sys.argv[1]
api_key = "claudes_eyes_secret_2025"
client = ESP32Client(host, api_key=api_key)
print(f"Connecting to {host}...")
if client.is_connected():
print("Connected!")
status = client.get_status()
print(f"\nStatus:")
print(f" Distance: {status.distance_cm} cm")
print(f" Battery: {status.battery_percent}%")
print(f" Action: {status.current_action}")
print(f" WiFi RSSI: {status.wifi_rssi} dBm")
print("\nCapturing image...")
img = client.capture_image_pil()
print(f" Size: {img.size}")
img.save("test_capture.jpg")
print(" Saved to test_capture.jpg")
else:
print("Could not connect to robot!")
@@ -0,0 +1,41 @@
# Claude's Eyes - Python Bridge Dependencies
# Install with: pip install -r requirements.txt
# HTTP requests to ESP32
requests>=2.31.0
# Configuration
pyyaml>=6.0.1
# Text-to-Speech
pyttsx3>=2.90
# Alternative: gTTS for Google TTS
gTTS>=2.4.0
# Speech-to-Text
SpeechRecognition>=3.10.0
# PyAudio for microphone access (may need special install on Windows)
# Windows: pip install pipwin && pipwin install pyaudio
# Linux: sudo apt install python3-pyaudio
PyAudio>=0.2.13
# Browser automation for Claude chat
selenium>=4.16.0
webdriver-manager>=4.0.1
# Image handling
Pillow>=10.2.0
# Audio playback
pygame>=2.5.2
# Async support
aiohttp>=3.9.0
asyncio-throttle>=1.0.2
# CLI interface
rich>=13.7.0
click>=8.1.7
# Optional: Claude API direct access (alternative to browser)
anthropic>=0.39.0
+216
View File
@@ -0,0 +1,216 @@
"""
Claude's Eyes - Speech-to-Text Engine
Converts Stefan's speech to text for Claude
"""
import logging
import threading
import queue
from typing import Optional, Callable
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class SpeechResult:
"""Result from speech recognition"""
text: str
confidence: float
is_final: bool
class STTEngine:
"""Speech-to-Text engine using SpeechRecognition library"""
def __init__(
self,
energy_threshold: int = 300,
pause_threshold: float = 0.8,
phrase_time_limit: int = 15,
service: str = "google",
language: str = "de-DE"
):
import speech_recognition as sr
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
# Configure recognizer
self.recognizer.energy_threshold = energy_threshold
self.recognizer.pause_threshold = pause_threshold
self.recognizer.phrase_time_limit = phrase_time_limit
self.service = service
self.language = language
self._listening = False
self._callback: Optional[Callable[[SpeechResult], None]] = None
self._stop_flag = False
self._thread: Optional[threading.Thread] = None
self._results_queue = queue.Queue()
# Calibrate microphone
logger.info("Calibrating microphone...")
with self.microphone as source:
self.recognizer.adjust_for_ambient_noise(source, duration=1)
logger.info(f"Energy threshold set to {self.recognizer.energy_threshold}")
logger.info(f"STT engine initialized (service: {service}, language: {language})")
def listen_once(self, timeout: Optional[float] = None) -> Optional[SpeechResult]:
"""
Listen for a single phrase (blocking)
Args:
timeout: Maximum time to wait for speech start
Returns:
SpeechResult or None if nothing recognized
"""
import speech_recognition as sr
try:
with self.microphone as source:
logger.debug("Listening...")
audio = self.recognizer.listen(source, timeout=timeout)
return self._recognize(audio)
except sr.WaitTimeoutError:
logger.debug("Listen timeout")
return None
except Exception as e:
logger.error(f"Listen error: {e}")
return None
def _recognize(self, audio) -> Optional[SpeechResult]:
"""Recognize speech from audio data"""
import speech_recognition as sr
try:
if self.service == "google":
text = self.recognizer.recognize_google(audio, language=self.language)
return SpeechResult(text=text, confidence=0.9, is_final=True)
elif self.service == "sphinx":
# Offline recognition (needs pocketsphinx)
text = self.recognizer.recognize_sphinx(audio)
return SpeechResult(text=text, confidence=0.7, is_final=True)
else:
logger.error(f"Unknown service: {self.service}")
return None
except sr.UnknownValueError:
logger.debug("Could not understand audio")
return None
except sr.RequestError as e:
logger.error(f"Recognition service error: {e}")
return None
def start_continuous(self, callback: Callable[[SpeechResult], None]) -> None:
"""
Start continuous listening in background
Args:
callback: Function called with each recognized phrase
"""
if self._listening:
logger.warning("Already listening")
return
self._callback = callback
self._stop_flag = False
self._listening = True
self._thread = threading.Thread(target=self._listen_loop, daemon=True)
self._thread.start()
logger.info("Continuous listening started")
def stop_continuous(self) -> None:
"""Stop continuous listening"""
self._stop_flag = True
self._listening = False
if self._thread:
self._thread.join(timeout=2)
self._thread = None
logger.info("Continuous listening stopped")
def _listen_loop(self):
"""Background thread for continuous listening"""
import speech_recognition as sr
while not self._stop_flag:
try:
with self.microphone as source:
# Short timeout to allow stop checks
try:
audio = self.recognizer.listen(source, timeout=1, phrase_time_limit=self.recognizer.phrase_time_limit)
except sr.WaitTimeoutError:
continue
result = self._recognize(audio)
if result and self._callback:
self._callback(result)
except Exception as e:
if not self._stop_flag:
logger.error(f"Listen loop error: {e}")
def is_listening(self) -> bool:
return self._listening
def get_result_nonblocking(self) -> Optional[SpeechResult]:
"""Get result without blocking (for use with async callback)"""
try:
return self._results_queue.get_nowait()
except queue.Empty:
return None
def create_stt_engine(**kwargs) -> STTEngine:
"""Factory function to create STT engine"""
return STTEngine(
energy_threshold=kwargs.get("energy_threshold", 300),
pause_threshold=kwargs.get("pause_threshold", 0.8),
phrase_time_limit=kwargs.get("phrase_time_limit", 15),
service=kwargs.get("service", "google"),
language=kwargs.get("language", "de-DE")
)
# Test when run directly
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
print("Speech-to-Text Test")
print("=" * 40)
engine = create_stt_engine(language="de-DE")
print("\nSag etwas (du hast 10 Sekunden)...")
result = engine.listen_once(timeout=10)
if result:
print(f"\nErkannt: '{result.text}'")
print(f"Konfidenz: {result.confidence:.0%}")
else:
print("\nNichts erkannt.")
print("\n\nKontinuierlicher Modus (5 Sekunden)...")
def on_speech(result: SpeechResult):
print(f" -> {result.text}")
engine.start_continuous(on_speech)
import time
time.sleep(5)
engine.stop_continuous()
print("\nDone!")
+229
View File
@@ -0,0 +1,229 @@
"""
Claude's Eyes - Text-to-Speech Engine
Converts Claude's text responses to spoken audio
"""
import logging
import threading
import queue
from typing import Optional
from abc import ABC, abstractmethod
logger = logging.getLogger(__name__)
class TTSEngine(ABC):
"""Abstract base class for TTS engines"""
@abstractmethod
def speak(self, text: str) -> None:
"""Speak the given text (blocking)"""
pass
@abstractmethod
def speak_async(self, text: str) -> None:
"""Speak the given text (non-blocking)"""
pass
@abstractmethod
def stop(self) -> None:
"""Stop current speech"""
pass
@abstractmethod
def is_speaking(self) -> bool:
"""Check if currently speaking"""
pass
class Pyttsx3Engine(TTSEngine):
"""TTS using pyttsx3 (offline, system voices)"""
def __init__(self, voice: Optional[str] = None, rate: int = 150, volume: float = 0.9):
import pyttsx3
self.engine = pyttsx3.init()
self.engine.setProperty('rate', rate)
self.engine.setProperty('volume', volume)
# Set voice if specified
if voice:
voices = self.engine.getProperty('voices')
for v in voices:
if voice.lower() in v.name.lower():
self.engine.setProperty('voice', v.id)
break
self._speaking = False
self._queue = queue.Queue()
self._thread: Optional[threading.Thread] = None
self._stop_flag = False
logger.info("Pyttsx3 TTS engine initialized")
def speak(self, text: str) -> None:
"""Speak text (blocking)"""
self._speaking = True
try:
self.engine.say(text)
self.engine.runAndWait()
finally:
self._speaking = False
def speak_async(self, text: str) -> None:
"""Speak text (non-blocking)"""
self._queue.put(text)
if self._thread is None or not self._thread.is_alive():
self._stop_flag = False
self._thread = threading.Thread(target=self._speech_worker, daemon=True)
self._thread.start()
def _speech_worker(self):
"""Worker thread for async speech"""
while not self._stop_flag:
try:
text = self._queue.get(timeout=0.5)
self.speak(text)
self._queue.task_done()
except queue.Empty:
continue
def stop(self) -> None:
"""Stop current speech"""
self._stop_flag = True
self.engine.stop()
# Clear queue
while not self._queue.empty():
try:
self._queue.get_nowait()
except queue.Empty:
break
def is_speaking(self) -> bool:
return self._speaking
class GTTSEngine(TTSEngine):
"""TTS using Google Text-to-Speech (online, better quality)"""
def __init__(self, language: str = "de"):
from gtts import gTTS
import pygame
pygame.mixer.init()
self.language = language
self._speaking = False
self._queue = queue.Queue()
self._thread: Optional[threading.Thread] = None
self._stop_flag = False
logger.info(f"gTTS engine initialized (language: {language})")
def speak(self, text: str) -> None:
"""Speak text (blocking)"""
from gtts import gTTS
import pygame
import tempfile
import os
self._speaking = True
try:
# Generate audio file
tts = gTTS(text=text, lang=self.language)
with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
temp_path = f.name
tts.save(temp_path)
# Play audio
pygame.mixer.music.load(temp_path)
pygame.mixer.music.play()
# Wait for playback to finish
while pygame.mixer.music.get_busy():
pygame.time.Clock().tick(10)
# Cleanup
os.unlink(temp_path)
except Exception as e:
logger.error(f"gTTS error: {e}")
finally:
self._speaking = False
def speak_async(self, text: str) -> None:
"""Speak text (non-blocking)"""
self._queue.put(text)
if self._thread is None or not self._thread.is_alive():
self._stop_flag = False
self._thread = threading.Thread(target=self._speech_worker, daemon=True)
self._thread.start()
def _speech_worker(self):
"""Worker thread for async speech"""
while not self._stop_flag:
try:
text = self._queue.get(timeout=0.5)
self.speak(text)
self._queue.task_done()
except queue.Empty:
continue
def stop(self) -> None:
"""Stop current speech"""
import pygame
self._stop_flag = True
pygame.mixer.music.stop()
# Clear queue
while not self._queue.empty():
try:
self._queue.get_nowait()
except queue.Empty:
break
def is_speaking(self) -> bool:
return self._speaking
def create_tts_engine(engine_type: str = "pyttsx3", **kwargs) -> TTSEngine:
"""
Factory function to create TTS engine
Args:
engine_type: "pyttsx3" or "gtts"
**kwargs: Engine-specific options
"""
if engine_type == "pyttsx3":
return Pyttsx3Engine(
voice=kwargs.get("voice"),
rate=kwargs.get("rate", 150),
volume=kwargs.get("volume", 0.9)
)
elif engine_type == "gtts":
return GTTSEngine(
language=kwargs.get("language", "de")
)
else:
raise ValueError(f"Unknown TTS engine: {engine_type}")
# Test when run directly
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
print("Testing pyttsx3...")
engine = create_tts_engine("pyttsx3", rate=150)
engine.speak("Hallo! Ich bin Claude und erkunde gerade deine Wohnung.")
print("\nTesting gTTS...")
try:
engine2 = create_tts_engine("gtts", language="de")
engine2.speak("Das hier klingt noch besser!")
except Exception as e:
print(f"gTTS not available: {e}")
print("\nDone!")