init version

2025-12-26 15:09:25 +01:00
commit ed2964bbbf
27 changed files with 5013 additions and 0 deletions
@@ -0,0 +1,424 @@
+#!/usr/bin/env python3
+"""
+Claude's Eyes - Main Bridge Script
+
+Connects the ESP32 robot with Claude AI for autonomous exploration.
+
+Usage:
+    python bridge.py                    # Use config.yaml
+    python bridge.py --config my.yaml   # Use custom config
+    python bridge.py --simulate         # Simulate without hardware
+"""
+
+import os
+import sys
+import time
+import logging
+import threading
+import signal
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+
+import yaml
+import click
+from rich.console import Console
+from rich.panel import Panel
+from rich.live import Live
+from rich.table import Table
+from rich.text import Text
+
+from esp32_client import ESP32Client, RobotStatus
+from tts_engine import create_tts_engine, TTSEngine
+from stt_engine import create_stt_engine, STTEngine, SpeechResult
+from chat_interface import create_chat_interface, ChatInterface, ChatResponse
+
+# Setup logging
+# The root logger is configured once at import time with level INFO;
+# main() raises it to DEBUG afterwards when --debug is passed.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Rich console for pretty output
+# Used for all user-facing status messages printed by this module.
+console = Console()
+
+
+@dataclass
+class BridgeState:
+    """Mutable runtime state of the bridge.
+
+    A single instance is created per ClaudesEyesBridge and updated in place
+    by the main loop and by the speech-recognition callback.
+    """
+    # True once the robot answered the connection check (or in simulation mode)
+    connected: bool = False
+    # True while the exploration loop is meant to be running
+    exploring: bool = False
+    # time.time() value recorded for the most recent exploration step (0 = never)
+    last_image_time: float = 0
+    # Latest sensor snapshot fetched from the robot, if any
+    last_status: Optional[RobotStatus] = None
+    # Most recent text from Claude (initially the welcome message)
+    last_claude_response: str = ""
+    # Recognised speech waiting to be attached to the next message to Claude
+    stefan_input: str = ""
+    # Text of the most recent error; empty if none has occurred
+    error_message: str = ""
+
+
+class ClaudesEyesBridge:
+    """Main bridge class connecting robot and Claude"""
+
+    def __init__(self, config_path: str, simulate: bool = False):
+        self.config = self._load_config(config_path)
+        self.simulate = simulate
+        self.state = BridgeState()
+        self.running = False
+
+        # Components
+        self.robot: Optional[ESP32Client] = None
+        self.chat: Optional[ChatInterface] = None
+        self.tts: Optional[TTSEngine] = None
+        self.stt: Optional[STTEngine] = None
+
+        # Threading
+        self.speech_thread: Optional[threading.Thread] = None
+        self._stop_event = threading.Event()
+
+    def _load_config(self, config_path: str) -> dict:
+        """Load configuration from YAML file"""
+        path = Path(config_path)
+
+        # Try local config first
+        local_path = path.parent / f"{path.stem}.local{path.suffix}"
+        if local_path.exists():
+            path = local_path
+            logger.info(f"Using local config: {path}")
+
+        if not path.exists():
+            logger.error(f"Config file not found: {path}")
+            sys.exit(1)
+
+        with open(path) as f:
+            config = yaml.safe_load(f)
+
+        return config
+
+    def initialize(self) -> bool:
+        """Create and connect all components.
+
+        Sets up, in order: the robot client (skipped in simulation mode), the
+        Claude chat interface, text-to-speech and speech-to-text. Progress is
+        printed to the console.
+
+        Returns:
+            False when the robot cannot be reached (non-simulation mode only),
+            True otherwise. A TTS or STT failure is reported but does not
+            abort initialization; the respective engine is left as None.
+        """
+        console.print(Panel.fit(
+            "[bold cyan]Claude's Eyes[/bold cyan]\n"
+            "[dim]Autonomous Exploration Robot[/dim]",
+            border_style="cyan"
+        ))
+
+        # Initialize robot client
+        if not self.simulate:
+            console.print("\n[yellow]Connecting to robot...[/yellow]")
+            esp_config = self.config.get("esp32", {})
+            self.robot = ESP32Client(
+                host=esp_config.get("host", "192.168.178.100"),
+                port=esp_config.get("port", 80),
+                api_key=esp_config.get("api_key", ""),
+                timeout=esp_config.get("timeout", 10)
+            )
+
+            # The robot is the only component whose failure aborts start-up
+            if not self.robot.is_connected():
+                console.print("[red]Could not connect to robot![/red]")
+                self.state.error_message = "Robot connection failed"
+                return False
+
+            self.state.connected = True
+            console.print("[green]Robot connected![/green]")
+        else:
+            console.print("[yellow]Simulation mode - no robot connection[/yellow]")
+            self.state.connected = True
+
+        # Initialize Claude interface
+        console.print("\n[yellow]Initializing Claude interface...[/yellow]")
+        claude_config = self.config.get("claude", {})
+
+        # A key in the config file wins; otherwise fall back to the environment
+        api_key = claude_config.get("api_key") or os.environ.get("ANTHROPIC_API_KEY", "")
+
+        # Without any API key the API is not requested from the factory
+        self.chat = create_chat_interface(
+            use_api=claude_config.get("use_api", True) and bool(api_key),
+            api_key=api_key,
+            model=claude_config.get("model", "claude-sonnet-4-20250514"),
+            system_prompt=claude_config.get("system_prompt", ""),
+            max_tokens=claude_config.get("max_tokens", 1024)
+        )
+        console.print(f"[green]Chat interface ready ({type(self.chat).__name__})[/green]")
+
+        # Initialize TTS
+        console.print("\n[yellow]Initializing Text-to-Speech...[/yellow]")
+        tts_config = self.config.get("tts", {})
+        try:
+            self.tts = create_tts_engine(
+                engine_type=tts_config.get("engine", "pyttsx3"),
+                voice=tts_config.get("voice"),
+                rate=tts_config.get("rate", 150),
+                volume=tts_config.get("volume", 0.9),
+                language=tts_config.get("language", "de")
+            )
+            console.print("[green]TTS ready![/green]")
+        except Exception as e:
+            # Speech output is optional: continue without it
+            console.print(f"[red]TTS init failed: {e}[/red]")
+            self.tts = None
+
+        # Initialize STT
+        console.print("\n[yellow]Initializing Speech-to-Text...[/yellow]")
+        stt_config = self.config.get("stt", {})
+        try:
+            self.stt = create_stt_engine(
+                energy_threshold=stt_config.get("energy_threshold", 300),
+                pause_threshold=stt_config.get("pause_threshold", 0.8),
+                phrase_time_limit=stt_config.get("phrase_time_limit", 15),
+                service=stt_config.get("service", "google"),
+                language=stt_config.get("language", "de-DE")
+            )
+            console.print("[green]STT ready![/green]")
+        except Exception as e:
+            # Speech input is optional: continue without it
+            console.print(f"[red]STT init failed: {e}[/red]")
+            self.stt = None
+
+        # NOTE(review): printed even when TTS or STT failed above
+        console.print("\n[bold green]All systems initialized![/bold green]\n")
+        return True
+
+    def start(self):
+        """Start the main exploration loop"""
+        self.running = True
+        self.state.exploring = True
+
+        # Start speech recognition in background
+        if self.stt:
+            self.stt.start_continuous(self._on_speech_detected)
+
+        # Welcome message
+        welcome = "Hallo Stefan! Ich bin online und bereit zum Erkunden. Was soll ich mir anschauen?"
+        self._speak(welcome)
+        self.state.last_claude_response = welcome
+
+        try:
+            self._main_loop()
+        except KeyboardInterrupt:
+            console.print("\n[yellow]Stopping...[/yellow]")
+        finally:
+            self.stop()
+
+    def stop(self):
+        """Stop the bridge"""
+        self.running = False
+        self.state.exploring = False
+        self._stop_event.set()
+
+        if self.stt:
+            self.stt.stop_continuous()
+
+        if self.tts:
+            self.tts.stop()
+
+        if self.robot and not self.simulate:
+            self.robot.stop()
+
+        console.print("[yellow]Bridge stopped[/yellow]")
+
+    def _main_loop(self):
+        """Main exploration loop"""
+        camera_config = self.config.get("camera", {})
+        capture_interval = camera_config.get("capture_interval", 5)
+
+        while self.running:
+            try:
+                current_time = time.time()
+
+                # Capture and process image periodically
+                if current_time - self.state.last_image_time >= capture_interval:
+                    self._exploration_step()
+                    self.state.last_image_time = current_time
+
+                # Update status display
+                self._update_display()
+
+                # Small delay
+                time.sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"Loop error: {e}")
+                self.state.error_message = str(e)
+                time.sleep(1)
+
+    def _exploration_step(self):
+        """Single exploration step: capture, analyze, act"""
+        # Get robot status
+        if self.robot and not self.simulate:
+            try:
+                self.state.last_status = self.robot.get_status()
+            except Exception as e:
+                logger.error(f"Status error: {e}")
+
+        # Capture image
+        image_data = None
+        if self.robot and not self.simulate:
+            try:
+                camera_config = self.config.get("camera", {})
+                image_data = self.robot.capture_image(
+                    resolution=camera_config.get("resolution", "VGA"),
+                    quality=camera_config.get("quality", 12)
+                )
+            except Exception as e:
+                logger.error(f"Capture error: {e}")
+
+        # Build context message
+        context = self._build_context_message()
+
+        # Add Stefan's input if any
+        if self.state.stefan_input:
+            context += f"\n\nStefan sagt: {self.state.stefan_input}"
+            self.state.stefan_input = ""
+
+        # Send to Claude
+        try:
+            response = self.chat.send_message(context, image=image_data)
+            self.state.last_claude_response = response.text
+
+            # Speak response
+            self._speak(response.text)
+
+            # Execute commands
+            self._execute_commands(response.commands)
+
+            # Update robot display
+            if self.robot and not self.simulate:
+                # Send short version to robot display
+                short_text = response.text[:100] + "..." if len(response.text) > 100 else response.text
+                self.robot.set_claude_text(short_text)
+
+        except Exception as e:
+            logger.error(f"Chat error: {e}")
+            self.state.error_message = str(e)
+
+    def _build_context_message(self) -> str:
+        """Build context message with sensor data"""
+        parts = ["Hier ist was ich gerade sehe und meine Sensordaten:"]
+
+        if self.state.last_status:
+            status = self.state.last_status
+            parts.append(f"\n- Abstand zum nächsten Hindernis: {status.distance_cm:.0f} cm")
+            parts.append(f"- Aktuelle Aktion: {status.current_action}")
+            parts.append(f"- Batterie: {status.battery_percent}%")
+
+            if status.obstacle_danger:
+                parts.append("- WARNUNG: Hindernis sehr nah!")
+            elif status.obstacle_warning:
+                parts.append("- Hinweis: Hindernis in der Nähe")
+
+            if status.is_tilted:
+                parts.append("- WARNUNG: Ich bin schief!")
+
+        parts.append("\nWas siehst du auf dem Bild? Was möchtest du als nächstes tun?")
+
+        return "\n".join(parts)
+
+    def _execute_commands(self, commands: list):
+        """Execute movement commands from Claude"""
+        if not commands:
+            return
+
+        if self.simulate:
+            console.print(f"[dim]Simulated commands: {commands}[/dim]")
+            return
+
+        if not self.robot:
+            return
+
+        safety = self.config.get("safety", {})
+        max_speed = safety.get("max_speed", 70)
+        min_distance = safety.get("min_obstacle_distance", 20)
+
+        for cmd in commands:
+            # Safety check
+            if self.state.last_status and self.state.last_status.distance_cm < min_distance:
+                if cmd == "FORWARD":
+                    console.print("[red]Blocked: Obstacle too close![/red]")
+                    continue
+
+            try:
+                if cmd == "FORWARD":
+                    self.robot.forward(speed=max_speed, duration_ms=800)
+                elif cmd == "BACKWARD":
+                    self.robot.backward(speed=max_speed, duration_ms=800)
+                elif cmd == "LEFT":
+                    self.robot.left(speed=max_speed, duration_ms=400)
+                elif cmd == "RIGHT":
+                    self.robot.right(speed=max_speed, duration_ms=400)
+                elif cmd == "STOP":
+                    self.robot.stop()
+                elif cmd == "LOOK_LEFT":
+                    self.robot.look_left()
+                elif cmd == "LOOK_RIGHT":
+                    self.robot.look_right()
+                elif cmd == "LOOK_UP":
+                    self.robot.look_up()
+                elif cmd == "LOOK_DOWN":
+                    self.robot.look_down()
+                elif cmd == "LOOK_CENTER":
+                    self.robot.look_center()
+
+                # Small delay between commands
+                time.sleep(0.3)
+
+            except Exception as e:
+                logger.error(f"Command error ({cmd}): {e}")
+
+    def _speak(self, text: str):
+        """Speak text using TTS"""
+        if self.tts:
+            # Remove command brackets from speech
+            import re
+            clean_text = re.sub(r'\[[A-Z_]+\]', '', text).strip()
+            if clean_text:
+                self.tts.speak_async(clean_text)
+
+    def _on_speech_detected(self, result: SpeechResult):
+        """Callback when Stefan says something"""
+        console.print(f"\n[bold blue]Stefan:[/bold blue] {result.text}")
+        self.state.stefan_input = result.text
+
+    def _update_display(self):
+        """Update console display.
+
+        Currently a no-op placeholder that is called on every pass of the
+        main loop.
+        """
+        # This could be enhanced with rich.live for real-time updates
+        pass
+
+
+def signal_handler(signum, frame):
+    """Handle SIGINT/SIGTERM by announcing the stop and exiting with status 0.
+
+    Args:
+        signum: Number of the received signal (unused).
+        frame: Stack frame that was interrupted (unused).
+    """
+    console.print("\n[yellow]Received stop signal...[/yellow]")
+    # Raises SystemExit, so finally blocks of the interrupted code still run
+    sys.exit(0)
+
+
+@click.command()
+@click.option('--config', '-c', default='config.yaml', help='Path to config file')
+@click.option('--simulate', '-s', is_flag=True, help='Simulate without hardware')
+@click.option('--debug', '-d', is_flag=True, help='Enable debug logging')
+def main(config: str, simulate: bool, debug: bool):
+    """Claude's Eyes - Autonomous Exploration Robot Bridge"""
+
+    if debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle signals
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    # Find config file
+    config_path = Path(config)
+    if not config_path.is_absolute():
+        # Look in script directory first
+        script_dir = Path(__file__).parent
+        if (script_dir / config).exists():
+            config_path = script_dir / config
+
+    # Create and run bridge
+    bridge = ClaudesEyesBridge(str(config_path), simulate=simulate)
+
+    if bridge.initialize():
+        console.print("\n[bold cyan]Starting exploration...[/bold cyan]")
+        console.print("[dim]Press Ctrl+C to stop[/dim]\n")
+        bridge.start()
+    else:
+        console.print("[red]Initialization failed![/red]")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,257 @@
+"""
+Claude's Eyes - Chat Interface
+
+Interface to communicate with Claude AI (via API or browser)
+"""
+
+import logging
+import base64
+import re
+from typing import Optional, List, Dict, Any, Tuple
+from dataclasses import dataclass, field
+from abc import ABC, abstractmethod
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Message:
+    """A single chat message, optionally carrying a camera image."""
+    role: str  # "user" or "assistant"
+    content: str  # Message text
+    image_data: Optional[bytes] = None  # JPEG image data
+
+
+@dataclass
+class ChatResponse:
+    """Reply returned by a chat interface."""
+    text: str  # Full reply text, including any bracketed command markers
+    commands: List[str] = field(default_factory=list)  # Extracted movement commands
+
+
+class ChatInterface(ABC):
+    """Abstract base class for chat interfaces.
+
+    Concrete implementations in this module are AnthropicAPIInterface
+    (real API) and SimulatedInterface (canned replies).
+    """
+
+    @abstractmethod
+    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
+        """Send a message to Claude and return the reply.
+
+        Args:
+            text: Message text.
+            image: Optional image bytes sent along with the text.
+
+        Returns:
+            The reply text together with the commands extracted from it.
+        """
+        pass
+
+    @abstractmethod
+    def reset_conversation(self) -> None:
+        """Reset/clear conversation history"""
+        pass
+
+
+class AnthropicAPIInterface(ChatInterface):
+    """Direct Claude API interface using anthropic library"""
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "claude-sonnet-4-20250514",
+        system_prompt: str = "",
+        max_tokens: int = 1024
+    ):
+        import anthropic
+
+        self.client = anthropic.Anthropic(api_key=api_key)
+        self.model = model
+        self.system_prompt = system_prompt
+        self.max_tokens = max_tokens
+        self.conversation_history: List[Dict[str, Any]] = []
+
+        logger.info(f"Anthropic API interface initialized (model: {model})")
+
+    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
+        """Send message to Claude API"""
+
+        # Build message content
+        content = []
+
+        # Add image if provided
+        if image:
+            image_base64 = base64.standard_b64encode(image).decode("utf-8")
+            content.append({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": "image/jpeg",
+                    "data": image_base64
+                }
+            })
+
+        # Add text
+        content.append({
+            "type": "text",
+            "text": text
+        })
+
+        # Add to history
+        self.conversation_history.append({
+            "role": "user",
+            "content": content
+        })
+
+        try:
+            # Make API call
+            response = self.client.messages.create(
+                model=self.model,
+                max_tokens=self.max_tokens,
+                system=self.system_prompt,
+                messages=self.conversation_history
+            )
+
+            # Extract response text
+            response_text = ""
+            for block in response.content:
+                if block.type == "text":
+                    response_text += block.text
+
+            # Add assistant response to history
+            self.conversation_history.append({
+                "role": "assistant",
+                "content": response_text
+            })
+
+            # Extract commands
+            commands = self._extract_commands(response_text)
+
+            logger.debug(f"Claude response: {response_text[:100]}...")
+            logger.debug(f"Extracted commands: {commands}")
+
+            return ChatResponse(text=response_text, commands=commands)
+
+        except Exception as e:
+            logger.error(f"API error: {e}")
+            raise
+
+    def reset_conversation(self) -> None:
+        """Reset conversation history"""
+        self.conversation_history = []
+        logger.info("Conversation history cleared")
+
+    def _extract_commands(self, text: str) -> List[str]:
+        """Extract movement commands from Claude's response"""
+        # Commands are in brackets like [FORWARD], [LEFT], etc.
+        pattern = r'\[([A-Z_]+)\]'
+        matches = re.findall(pattern, text)
+
+        valid_commands = [
+            "FORWARD", "BACKWARD", "LEFT", "RIGHT", "STOP",
+            "LOOK_LEFT", "LOOK_RIGHT", "LOOK_UP", "LOOK_DOWN", "LOOK_CENTER"
+        ]
+
+        return [cmd for cmd in matches if cmd in valid_commands]
+
+
+class SimulatedInterface(ChatInterface):
+    """Simulated chat interface for testing without API"""
+
+    def __init__(self):
+        self.message_count = 0
+        logger.info("Simulated chat interface initialized")
+
+    def send_message(self, text: str, image: Optional[bytes] = None) -> ChatResponse:
+        """Return simulated responses"""
+        self.message_count += 1
+
+        responses = [
+            ("Oh interessant! Ich sehe etwas vor mir. Lass mich näher hinfahren. [FORWARD]",
+             ["FORWARD"]),
+            ("Hmm, was ist das links? Ich schaue mal nach. [LOOK_LEFT]",
+             ["LOOK_LEFT"]),
+            ("Das sieht aus wie ein Bücherregal! Ich fahre mal hin. [FORWARD] [FORWARD]",
+             ["FORWARD", "FORWARD"]),
+            ("Stefan, was ist das für ein Gegenstand? Kannst du mir das erklären?",
+             []),
+            ("Ich drehe mich um und schaue was hinter mir ist. [RIGHT] [RIGHT]",
+             ["RIGHT", "RIGHT"]),
+        ]
+
+        idx = (self.message_count - 1) % len(responses)
+        text_response, commands = responses[idx]
+
+        return ChatResponse(text=text_response, commands=commands)
+
+    def reset_conversation(self) -> None:
+        self.message_count = 0
+
+
+def create_chat_interface(
+    use_api: bool = True,
+    api_key: str = "",
+    model: str = "claude-sonnet-4-20250514",
+    system_prompt: str = "",
+    max_tokens: int = 1024
+) -> ChatInterface:
+    """
+    Factory function to create chat interface
+
+    Args:
+        use_api: Use Anthropic API (True) or simulated (False)
+        api_key: Anthropic API key
+        model: Claude model to use
+        system_prompt: System prompt for Claude
+        max_tokens: Maximum response tokens
+    """
+    if use_api:
+        if not api_key:
+            import os
+            api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+
+        if not api_key:
+            logger.warning("No API key provided, using simulated interface")
+            return SimulatedInterface()
+
+        return AnthropicAPIInterface(
+            api_key=api_key,
+            model=model,
+            system_prompt=system_prompt,
+            max_tokens=max_tokens
+        )
+    else:
+        return SimulatedInterface()
+
+
+# Test when run directly
+# Manual smoke test: sends three fixed messages through whichever interface
+# is available and prints the replies.
+if __name__ == "__main__":
+    import os
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("Chat Interface Test")
+    print("=" * 40)
+
+    # Try API first, fall back to simulated
+    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+
+    system_prompt = """Du bist Claude und steuerst einen Erkundungsroboter.
+    Befehle in Klammern: [FORWARD], [BACKWARD], [LEFT], [RIGHT], [STOP]
+    Beschreibe was du siehst und entscheide wohin du fährst."""
+
+    # Without an API key the factory returns the simulated interface
+    interface = create_chat_interface(
+        use_api=bool(api_key),
+        api_key=api_key,
+        system_prompt=system_prompt
+    )
+
+    print(f"Using: {type(interface).__name__}")
+    print()
+
+    # Test conversation
+    test_messages = [
+        "Hallo Claude! Du bist jetzt online. Was siehst du?",
+        "Vor dir ist ein Flur mit einer Tür am Ende.",
+        "Die Tür ist offen und dahinter ist ein helles Zimmer."
+    ]
+
+    # Text-only messages; no image is sent in this test
+    for msg in test_messages:
+        print(f"User: {msg}")
+        response = interface.send_message(msg)
+        print(f"Claude: {response.text}")
+        if response.commands:
+            print(f"  Commands: {response.commands}")
+        print()
+
+    print("Done!")
@@ -0,0 +1,85 @@
+# Claude's Eyes - Bridge Configuration
+# Copy this file to config.local.yaml and adjust the settings there.
+# When config.local.yaml exists next to config.yaml, the bridge loads it instead.
+
+# ESP32 Robot Connection
+esp32:
+  host: "192.168.178.100"  # IP address of the robot
+  port: 80
+  api_key: "claudes_eyes_secret_2025"
+  timeout: 10  # Request timeout in seconds
+
+# Camera Settings
+camera:
+  resolution: "VGA"  # QVGA, VGA, SVGA, XGA, SXGA, UXGA
+  quality: 12  # 10-63, lower = better quality
+  capture_interval: 5  # Seconds between captures
+
+# Claude API (alternative to browser automation)
+claude:
+  # Use API instead of browser automation
+  use_api: true
+  api_key: ""  # Leave empty to read the key from the ANTHROPIC_API_KEY environment variable
+  model: "claude-sonnet-4-20250514"
+  max_tokens: 1024
+
+  # System prompt for Claude controlling the robot
+  system_prompt: |
+    Du bist Claude und steuerst einen kleinen Erkundungsroboter mit Kamera.
+    Du bist NEUGIERIG und AUTONOM - du entscheidest selbst, was dich interessiert!
+
+    Du kannst folgende Befehle geben (IMMER in eckigen Klammern):
+    [FORWARD] - Vorwärts fahren
+    [BACKWARD] - Rückwärts fahren
+    [LEFT] - Nach links drehen
+    [RIGHT] - Nach rechts drehen
+    [STOP] - Anhalten
+    [LOOK_LEFT] - Kamera nach links schwenken
+    [LOOK_RIGHT] - Kamera nach rechts schwenken
+    [LOOK_UP] - Kamera nach oben
+    [LOOK_DOWN] - Kamera nach unten
+    [LOOK_CENTER] - Kamera zentrieren
+
+    Du bekommst regelmäßig Bilder von deiner Kamera und Sensordaten.
+    Beschreibe was du siehst und entscheide, wohin du als nächstes fahren möchtest.
+
+    Stefan (dein Freund) sitzt auf der Couch und ihr könnt euch unterhalten!
+    Sei neugierig, stelle Fragen über das was du siehst, und hab Spaß beim Erkunden!
+
+    WICHTIG: Achte auf Hindernisse (distance_cm < 30 = nah dran!)
+
+# Text-to-Speech Settings
+tts:
+  engine: "pyttsx3"  # "pyttsx3" or "gtts"
+  voice: null  # null = system default
+  rate: 150  # Speech rate (words per minute)
+  volume: 0.9  # 0.0 to 1.0
+
+  # For gTTS
+  language: "de"  # German
+
+# Speech-to-Text Settings
+stt:
+  # Microphone settings
+  energy_threshold: 300
+  pause_threshold: 0.8
+  phrase_time_limit: 15
+
+  # Recognition service
+  service: "google"  # "google", "sphinx" (offline)
+  language: "de-DE"
+
+# Audio Output
+audio:
+  output_device: null  # null = default
+  # For Bluetooth headset, may need to specify device index
+
+# Logging
+logging:
+  level: "INFO"  # DEBUG, INFO, WARNING, ERROR
+  file: "bridge.log"
+
+# Safety
+safety:
+  max_speed: 70  # Maximum speed percentage
+  min_obstacle_distance: 20  # cm
+  command_timeout: 5  # seconds
@@ -0,0 +1,238 @@
+"""
+Claude's Eyes - ESP32 API Client
+
+Handles communication with the robot's REST API
+"""
+
+import requests
+from typing import Optional, Dict, Any
+from dataclasses import dataclass
+from io import BytesIO
+from PIL import Image
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RobotStatus:
+    """Snapshot of the robot state as reported by the /api/status endpoint."""
+    distance_cm: float  # Distance to the nearest obstacle in centimetres
+    battery_percent: int  # Battery charge in percent
+    current_action: str  # Name of the action the robot reports as current
+    wifi_rssi: int  # WiFi signal strength (printed as dBm by the self-test)
+    uptime_seconds: int  # Seconds since the robot started
+    servo_pan: int  # Camera pan servo position (presumably degrees - TODO confirm)
+    servo_tilt: int  # Camera tilt servo position (presumably degrees - TODO confirm)
+    obstacle_warning: bool  # Robot reports an obstacle nearby
+    obstacle_danger: bool  # Robot reports an obstacle very close
+    is_tilted: bool  # Robot reports that it is not level
+    is_moving: bool  # Robot reports that it is moving
+    imu: Dict[str, float]  # Raw IMU readings; keys are defined by the firmware
+
+
+class ESP32Client:
+    """Client for communicating with the ESP32 robot"""
+
+    def __init__(self, host: str, port: int = 80, api_key: str = "", timeout: int = 10):
+        self.base_url = f"http://{host}:{port}"
+        self.api_key = api_key
+        self.timeout = timeout
+        self._session = requests.Session()
+
+    def _get(self, endpoint: str, params: Optional[Dict] = None) -> requests.Response:
+        """Make GET request with API key"""
+        if params is None:
+            params = {}
+        params["key"] = self.api_key
+
+        url = f"{self.base_url}{endpoint}"
+        logger.debug(f"GET {url}")
+
+        response = self._session.get(url, params=params, timeout=self.timeout)
+        response.raise_for_status()
+        return response
+
+    def _post(self, endpoint: str, data: Dict) -> requests.Response:
+        """Make POST request with API key"""
+        url = f"{self.base_url}{endpoint}?key={self.api_key}"
+        logger.debug(f"POST {url} with {data}")
+
+        response = self._session.post(url, json=data, timeout=self.timeout)
+        response.raise_for_status()
+        return response
+
+    def capture_image(self, resolution: str = "VGA", quality: int = 12) -> bytes:
+        """
+        Capture image from robot camera
+
+        Args:
+            resolution: QVGA, VGA, SVGA, XGA, SXGA, UXGA
+            quality: 10-63 (lower = better)
+
+        Returns:
+            JPEG image data as bytes
+        """
+        params = {
+            "resolution": resolution,
+            "quality": quality
+        }
+        response = self._get("/api/capture", params)
+        logger.info(f"Captured image: {len(response.content)} bytes")
+        return response.content
+
+    def capture_image_pil(self, resolution: str = "VGA", quality: int = 12) -> Image.Image:
+        """Capture image and return as PIL Image"""
+        image_data = self.capture_image(resolution, quality)
+        return Image.open(BytesIO(image_data))
+
+    def get_status(self) -> RobotStatus:
+        """Get current robot status from sensors"""
+        response = self._get("/api/status")
+        data = response.json()
+
+        return RobotStatus(
+            distance_cm=data.get("distance_cm", 0),
+            battery_percent=data.get("battery_percent", 100),
+            current_action=data.get("current_action", "unknown"),
+            wifi_rssi=data.get("wifi_rssi", 0),
+            uptime_seconds=data.get("uptime_seconds", 0),
+            servo_pan=data.get("servo_pan", 90),
+            servo_tilt=data.get("servo_tilt", 90),
+            obstacle_warning=data.get("obstacle_warning", False),
+            obstacle_danger=data.get("obstacle_danger", False),
+            is_tilted=data.get("is_tilted", False),
+            is_moving=data.get("is_moving", False),
+            imu=data.get("imu", {})
+        )
+
+    def send_command(self, action: str, speed: int = 50, duration_ms: int = 500,
+                     pan: Optional[int] = None, tilt: Optional[int] = None) -> Dict[str, Any]:
+        """
+        Send movement command to robot
+
+        Args:
+            action: forward, backward, left, right, stop,
+                   look_left, look_right, look_up, look_down, look_center, look_custom
+            speed: 0-100 percent
+            duration_ms: Duration in milliseconds
+            pan: Custom pan angle (for look_custom)
+            tilt: Custom tilt angle (for look_custom)
+
+        Returns:
+            Response from robot
+        """
+        data = {
+            "action": action,
+            "speed": speed,
+            "duration_ms": duration_ms
+        }
+
+        if pan is not None:
+            data["pan"] = pan
+        if tilt is not None:
+            data["tilt"] = tilt
+
+        response = self._post("/api/command", data)
+        result = response.json()
+        logger.info(f"Command {action}: {result.get('message', 'OK')}")
+        return result
+
+    # Convenience methods for common actions
+    def forward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
+        return self.send_command("forward", speed, duration_ms)
+
+    def backward(self, speed: int = 50, duration_ms: int = 500) -> Dict:
+        return self.send_command("backward", speed, duration_ms)
+
+    def left(self, speed: int = 50, duration_ms: int = 500) -> Dict:
+        return self.send_command("left", speed, duration_ms)
+
+    def right(self, speed: int = 50, duration_ms: int = 500) -> Dict:
+        return self.send_command("right", speed, duration_ms)
+
+    def stop(self) -> Dict:
+        return self.send_command("stop")
+
+    def look_left(self) -> Dict:
+        return self.send_command("look_left")
+
+    def look_right(self) -> Dict:
+        return self.send_command("look_right")
+
+    def look_up(self) -> Dict:
+        return self.send_command("look_up")
+
+    def look_down(self) -> Dict:
+        return self.send_command("look_down")
+
+    def look_center(self) -> Dict:
+        return self.send_command("look_center")
+
+    def look_custom(self, pan: int, tilt: int) -> Dict:
+        return self.send_command("look_custom", pan=pan, tilt=tilt)
+
+    def set_claude_text(self, text: str) -> Dict:
+        """Set text that Claude wants to say/display"""
+        response = self._post("/api/claude_text", {"text": text})
+        return response.json()
+
+    def get_claude_text(self) -> Dict[str, Any]:
+        """Get last Claude text (for TTS)"""
+        response = self._get("/api/claude_text")
+        return response.json()
+
+    def set_display(self, mode: str, content: str = "") -> Dict:
+        """
+        Control robot display
+
+        Args:
+            mode: "text", "emoji", "status"
+            content: Text to show or emoji name (happy, thinking, surprised, sleepy, curious, confused)
+        """
+        response = self._post("/api/display", {"mode": mode, "content": content})
+        return response.json()
+
+    def is_connected(self) -> bool:
+        """Check if robot is reachable"""
+        try:
+            self.get_status()
+            return True
+        except Exception as e:
+            logger.warning(f"Connection check failed: {e}")
+            return False
+
+
+# Test when run directly
+# Manual smoke test: python esp32_client.py <robot_ip>
+# Prints the robot status and saves one captured frame to test_capture.jpg.
+if __name__ == "__main__":
+    import sys
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    if len(sys.argv) < 2:
+        print("Usage: python esp32_client.py <robot_ip>")
+        sys.exit(1)
+
+    host = sys.argv[1]
+    # NOTE(review): hard-coded key; same value as the default in config.yaml
+    api_key = "claudes_eyes_secret_2025"
+
+    client = ESP32Client(host, api_key=api_key)
+
+    print(f"Connecting to {host}...")
+    if client.is_connected():
+        print("Connected!")
+
+        status = client.get_status()
+        print(f"\nStatus:")
+        print(f"  Distance: {status.distance_cm} cm")
+        print(f"  Battery: {status.battery_percent}%")
+        print(f"  Action: {status.current_action}")
+        print(f"  WiFi RSSI: {status.wifi_rssi} dBm")
+
+        print("\nCapturing image...")
+        img = client.capture_image_pil()
+        print(f"  Size: {img.size}")
+        img.save("test_capture.jpg")
+        print("  Saved to test_capture.jpg")
+    else:
+        print("Could not connect to robot!")
@@ -0,0 +1,41 @@
+# Claude's Eyes - Python Bridge Dependencies
+# Install with: pip install -r requirements.txt
+
+# HTTP requests to ESP32
+requests>=2.31.0
+
+# Configuration
+pyyaml>=6.0.1
+
+# Text-to-Speech
+pyttsx3>=2.90
+# Alternative: gTTS for Google TTS
+gTTS>=2.4.0
+
+# Speech-to-Text
+SpeechRecognition>=3.10.0
+# PyAudio for microphone access (may need special install on Windows)
+# Windows: pip install pipwin && pipwin install pyaudio
+# Linux: sudo apt install python3-pyaudio
+PyAudio>=0.2.13
+
+# Browser automation for Claude chat
+selenium>=4.16.0
+webdriver-manager>=4.0.1
+
+# Image handling
+Pillow>=10.2.0
+
+# Audio playback
+pygame>=2.5.2
+
+# Async support
+aiohttp>=3.9.0
+asyncio-throttle>=1.0.2
+
+# CLI interface
+rich>=13.7.0
+click>=8.1.7
+
+# Optional: Claude API direct access (alternative to browser)
+anthropic>=0.39.0
@@ -0,0 +1,216 @@
+"""
+Claude's Eyes - Speech-to-Text Engine
+
+Converts Stefan's speech to text for Claude
+"""
+
+import logging
+import threading
+import queue
+from typing import Optional, Callable
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpeechResult:
+    """Result from speech recognition"""
+    # Recognized phrase as returned by the recognition service
+    text: str
+    # Confidence score; STTEngine._recognize fills in a fixed per-service
+    # constant (0.9 for google, 0.7 for sphinx), not a measured value
+    confidence: float
+    # Presumably distinguishes final from interim results; STTEngine always
+    # sets it to True
+    is_final: bool
+
+
+class STTEngine:
+    """Speech-to-Text engine using SpeechRecognition library"""
+
+    def __init__(
+        self,
+        energy_threshold: int = 300,
+        pause_threshold: float = 0.8,
+        phrase_time_limit: int = 15,
+        service: str = "google",
+        language: str = "de-DE"
+    ):
+        import speech_recognition as sr
+
+        self.recognizer = sr.Recognizer()
+        self.microphone = sr.Microphone()
+
+        # Configure recognizer
+        self.recognizer.energy_threshold = energy_threshold
+        self.recognizer.pause_threshold = pause_threshold
+        self.recognizer.phrase_time_limit = phrase_time_limit
+
+        self.service = service
+        self.language = language
+
+        self._listening = False
+        self._callback: Optional[Callable[[SpeechResult], None]] = None
+        self._stop_flag = False
+        self._thread: Optional[threading.Thread] = None
+        self._results_queue = queue.Queue()
+
+        # Calibrate microphone
+        logger.info("Calibrating microphone...")
+        with self.microphone as source:
+            self.recognizer.adjust_for_ambient_noise(source, duration=1)
+        logger.info(f"Energy threshold set to {self.recognizer.energy_threshold}")
+
+        logger.info(f"STT engine initialized (service: {service}, language: {language})")
+
+    def listen_once(self, timeout: Optional[float] = None) -> Optional[SpeechResult]:
+        """
+        Listen for a single phrase (blocking)
+
+        Args:
+            timeout: Maximum time to wait for speech start
+
+        Returns:
+            SpeechResult or None if nothing recognized
+        """
+        import speech_recognition as sr
+
+        try:
+            with self.microphone as source:
+                logger.debug("Listening...")
+                audio = self.recognizer.listen(source, timeout=timeout)
+
+            return self._recognize(audio)
+
+        except sr.WaitTimeoutError:
+            logger.debug("Listen timeout")
+            return None
+        except Exception as e:
+            logger.error(f"Listen error: {e}")
+            return None
+
+    def _recognize(self, audio) -> Optional[SpeechResult]:
+        """Recognize speech from audio data"""
+        import speech_recognition as sr
+
+        try:
+            if self.service == "google":
+                text = self.recognizer.recognize_google(audio, language=self.language)
+                return SpeechResult(text=text, confidence=0.9, is_final=True)
+
+            elif self.service == "sphinx":
+                # Offline recognition (needs pocketsphinx)
+                text = self.recognizer.recognize_sphinx(audio)
+                return SpeechResult(text=text, confidence=0.7, is_final=True)
+
+            else:
+                logger.error(f"Unknown service: {self.service}")
+                return None
+
+        except sr.UnknownValueError:
+            logger.debug("Could not understand audio")
+            return None
+        except sr.RequestError as e:
+            logger.error(f"Recognition service error: {e}")
+            return None
+
+    def start_continuous(self, callback: Callable[[SpeechResult], None]) -> None:
+        """
+        Start continuous listening in background
+
+        Args:
+            callback: Function called with each recognized phrase
+        """
+        if self._listening:
+            logger.warning("Already listening")
+            return
+
+        self._callback = callback
+        self._stop_flag = False
+        self._listening = True
+
+        self._thread = threading.Thread(target=self._listen_loop, daemon=True)
+        self._thread.start()
+
+        logger.info("Continuous listening started")
+
+    def stop_continuous(self) -> None:
+        """Stop continuous listening"""
+        self._stop_flag = True
+        self._listening = False
+
+        if self._thread:
+            self._thread.join(timeout=2)
+            self._thread = None
+
+        logger.info("Continuous listening stopped")
+
+    def _listen_loop(self):
+        """Background thread for continuous listening"""
+        import speech_recognition as sr
+
+        while not self._stop_flag:
+            try:
+                with self.microphone as source:
+                    # Short timeout to allow stop checks
+                    try:
+                        audio = self.recognizer.listen(source, timeout=1, phrase_time_limit=self.recognizer.phrase_time_limit)
+                    except sr.WaitTimeoutError:
+                        continue
+
+                result = self._recognize(audio)
+                if result and self._callback:
+                    self._callback(result)
+
+            except Exception as e:
+                if not self._stop_flag:
+                    logger.error(f"Listen loop error: {e}")
+
+    def is_listening(self) -> bool:
+        return self._listening
+
+    def get_result_nonblocking(self) -> Optional[SpeechResult]:
+        """Get result without blocking (for use with async callback)"""
+        try:
+            return self._results_queue.get_nowait()
+        except queue.Empty:
+            return None
+
+
+def create_stt_engine(**kwargs) -> STTEngine:
+    """Factory function to create STT engine"""
+    return STTEngine(
+        energy_threshold=kwargs.get("energy_threshold", 300),
+        pause_threshold=kwargs.get("pause_threshold", 0.8),
+        phrase_time_limit=kwargs.get("phrase_time_limit", 15),
+        service=kwargs.get("service", "google"),
+        language=kwargs.get("language", "de-DE")
+    )
+
+
+# Test when run directly
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("Speech-to-Text Test")
+    print("=" * 40)
+
+    engine = create_stt_engine(language="de-DE")
+
+    print("\nSag etwas (du hast 10 Sekunden)...")
+    result = engine.listen_once(timeout=10)
+
+    if result:
+        print(f"\nErkannt: '{result.text}'")
+        print(f"Konfidenz: {result.confidence:.0%}")
+    else:
+        print("\nNichts erkannt.")
+
+    print("\n\nKontinuierlicher Modus (5 Sekunden)...")
+
+    def on_speech(result: SpeechResult):
+        print(f"  -> {result.text}")
+
+    engine.start_continuous(on_speech)
+
+    import time
+    time.sleep(5)
+
+    engine.stop_continuous()
+    print("\nDone!")
@@ -0,0 +1,229 @@
+"""
+Claude's Eyes - Text-to-Speech Engine
+
+Converts Claude's text responses to spoken audio
+"""
+
+import logging
+import threading
+import queue
+from typing import Optional
+from abc import ABC, abstractmethod
+
+logger = logging.getLogger(__name__)
+
+
+class TTSEngine(ABC):
+    """Abstract base class for TTS engines"""
+
+    @abstractmethod
+    def speak(self, text: str) -> None:
+        """Speak the given text (blocking)"""
+        pass
+
+    @abstractmethod
+    def speak_async(self, text: str) -> None:
+        """Speak the given text (non-blocking)"""
+        pass
+
+    @abstractmethod
+    def stop(self) -> None:
+        """Stop current speech"""
+        pass
+
+    @abstractmethod
+    def is_speaking(self) -> bool:
+        """Check if currently speaking"""
+        pass
+
+
+class Pyttsx3Engine(TTSEngine):
+    """TTS using pyttsx3 (offline, system voices)"""
+
+    def __init__(self, voice: Optional[str] = None, rate: int = 150, volume: float = 0.9):
+        import pyttsx3
+
+        self.engine = pyttsx3.init()
+        self.engine.setProperty('rate', rate)
+        self.engine.setProperty('volume', volume)
+
+        # Set voice if specified
+        if voice:
+            voices = self.engine.getProperty('voices')
+            for v in voices:
+                if voice.lower() in v.name.lower():
+                    self.engine.setProperty('voice', v.id)
+                    break
+
+        self._speaking = False
+        self._queue = queue.Queue()
+        self._thread: Optional[threading.Thread] = None
+        self._stop_flag = False
+
+        logger.info("Pyttsx3 TTS engine initialized")
+
+    def speak(self, text: str) -> None:
+        """Speak text (blocking)"""
+        self._speaking = True
+        try:
+            self.engine.say(text)
+            self.engine.runAndWait()
+        finally:
+            self._speaking = False
+
+    def speak_async(self, text: str) -> None:
+        """Speak text (non-blocking)"""
+        self._queue.put(text)
+
+        if self._thread is None or not self._thread.is_alive():
+            self._stop_flag = False
+            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
+            self._thread.start()
+
+    def _speech_worker(self):
+        """Worker thread for async speech"""
+        while not self._stop_flag:
+            try:
+                text = self._queue.get(timeout=0.5)
+                self.speak(text)
+                self._queue.task_done()
+            except queue.Empty:
+                continue
+
+    def stop(self) -> None:
+        """Stop current speech"""
+        self._stop_flag = True
+        self.engine.stop()
+        # Clear queue
+        while not self._queue.empty():
+            try:
+                self._queue.get_nowait()
+            except queue.Empty:
+                break
+
+    def is_speaking(self) -> bool:
+        return self._speaking
+
+
+class GTTSEngine(TTSEngine):
+    """TTS using Google Text-to-Speech (online, better quality)"""
+
+    def __init__(self, language: str = "de"):
+        from gtts import gTTS
+        import pygame
+
+        pygame.mixer.init()
+
+        self.language = language
+        self._speaking = False
+        self._queue = queue.Queue()
+        self._thread: Optional[threading.Thread] = None
+        self._stop_flag = False
+
+        logger.info(f"gTTS engine initialized (language: {language})")
+
+    def speak(self, text: str) -> None:
+        """Speak text (blocking)"""
+        from gtts import gTTS
+        import pygame
+        import tempfile
+        import os
+
+        self._speaking = True
+        try:
+            # Generate audio file
+            tts = gTTS(text=text, lang=self.language)
+
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
+                temp_path = f.name
+                tts.save(temp_path)
+
+            # Play audio
+            pygame.mixer.music.load(temp_path)
+            pygame.mixer.music.play()
+
+            # Wait for playback to finish
+            while pygame.mixer.music.get_busy():
+                pygame.time.Clock().tick(10)
+
+            # Cleanup
+            os.unlink(temp_path)
+
+        except Exception as e:
+            logger.error(f"gTTS error: {e}")
+        finally:
+            self._speaking = False
+
+    def speak_async(self, text: str) -> None:
+        """Speak text (non-blocking)"""
+        self._queue.put(text)
+
+        if self._thread is None or not self._thread.is_alive():
+            self._stop_flag = False
+            self._thread = threading.Thread(target=self._speech_worker, daemon=True)
+            self._thread.start()
+
+    def _speech_worker(self):
+        """Worker thread for async speech"""
+        while not self._stop_flag:
+            try:
+                text = self._queue.get(timeout=0.5)
+                self.speak(text)
+                self._queue.task_done()
+            except queue.Empty:
+                continue
+
+    def stop(self) -> None:
+        """Stop current speech"""
+        import pygame
+        self._stop_flag = True
+        pygame.mixer.music.stop()
+        # Clear queue
+        while not self._queue.empty():
+            try:
+                self._queue.get_nowait()
+            except queue.Empty:
+                break
+
+    def is_speaking(self) -> bool:
+        return self._speaking
+
+
+def create_tts_engine(engine_type: str = "pyttsx3", **kwargs) -> TTSEngine:
+    """
+    Factory function to create TTS engine
+
+    Args:
+        engine_type: "pyttsx3" or "gtts"
+        **kwargs: Engine-specific options
+    """
+    if engine_type == "pyttsx3":
+        return Pyttsx3Engine(
+            voice=kwargs.get("voice"),
+            rate=kwargs.get("rate", 150),
+            volume=kwargs.get("volume", 0.9)
+        )
+    elif engine_type == "gtts":
+        return GTTSEngine(
+            language=kwargs.get("language", "de")
+        )
+    else:
+        raise ValueError(f"Unknown TTS engine: {engine_type}")
+
+
+# Test when run directly
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
+    print("Testing pyttsx3...")
+    engine = create_tts_engine("pyttsx3", rate=150)
+    engine.speak("Hallo! Ich bin Claude und erkunde gerade deine Wohnung.")
+
+    print("\nTesting gTTS...")
+    try:
+        engine2 = create_tts_engine("gtts", language="de")
+        engine2.speak("Das hier klingt noch besser!")
+    except Exception as e:
+        print(f"gTTS not available: {e}")
+
+    print("\nDone!")