feat(flux): Bildgenerierung via FLUX.1-dev — flux-bridge auf Gamebox

Eigener Compose-Stack im /flux Verzeichnis (kann auf separater Maschine laufen). aria-bridge routet flux_request via RVS, ARIA referenziert das fertige PNG im Reply mit [FILE: ...]-Marker. Brain-Tool flux_generate mit Caps fuer steps/dimension. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 22:33:48 +02:00
parent 33d5be781f
commit 7e53dcfed3
12 changed files with 984 additions and 2 deletions
@@ -0,0 +1,36 @@
+# ════════════════════════════════════════════════
+#  ARIA FLUX-Bridge — Konfiguration
+#  Kopieren nach .env und anpassen
+# ════════════════════════════════════════════════
+
+# RVS Verbindung (gleiche Daten wie auf der ARIA-VM / xtts/.env)
+RVS_HOST=mobil.hacker-net.de
+RVS_PORT=444
+RVS_TLS=true
+RVS_TLS_FALLBACK=true
+RVS_TOKEN=dein_token_hier
+
+# HuggingFace-Token — FLUX.1-dev ist gated (auf
+# https://huggingface.co/black-forest-labs/FLUX.1-dev "Agree" klicken,
+# dann unter https://huggingface.co/settings/tokens ein "Read"-Token
+# erzeugen). Fuer FLUX.1-schnell nicht noetig.
+HF_TOKEN=
+
+# Modell:
+#   black-forest-labs/FLUX.1-dev      (Default, ~24 GB, non-commercial)
+#   black-forest-labs/FLUX.1-schnell  (4 Steps, Apache-2.0, schneller)
+FLUX_MODEL=black-forest-labs/FLUX.1-dev
+
+# Offloading-Strategie (VRAM-Steuerung):
+#   model       — Default. Komponentenweise CPU-Offload, gut fuer 12 GB Karten.
+#   sequential  — sparsamer (Peak ~6 GB), aber 2-3x langsamer.
+#   none        — alles auf GPU. Nur fuer >= 24 GB VRAM-Karten.
+FLUX_OFFLOAD=model
+
+# Float-Type. bfloat16 ist FLUX-native; auf alten Karten ohne BF16-Support
+# auf float16 wechseln.
+FLUX_DTYPE=bfloat16
+
+# Hard-Caps gegen versehentlich teure Renders
+FLUX_MAX_STEPS=50
+FLUX_MAX_DIM=1536
@@ -0,0 +1,5 @@
+# HuggingFace Model-Cache (FLUX.1-dev ~24 GB on disk)
+hf-cache/
+
+# Docker .env
+.env
@@ -0,0 +1,23 @@
+FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip git \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# PyTorch CUDA-Wheels zuerst, damit diffusers nicht CPU-Torch zieht.
+# Versionsmatrix wie bei f5tts gehalten (cu121, Torch 2.3.1) — gleicher
+# Treiber-Footprint, gleicher HF-Cache-Pfad.
+RUN pip3 install --no-cache-dir torch==2.3.1 \
+    --index-url https://download.pytorch.org/whl/cu121
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY bridge.py .
+
+CMD ["python3", "bridge.py"]
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+"""
+ARIA FLUX-Bridge — laeuft auf der Gamebox (RTX 3060).
+
+Empfaengt flux_request via RVS → FLUX.1-dev/-schnell auf GPU → sendet
+flux_response mit base64-PNG zurueck an die aria-bridge. Diese speichert
+die Datei nach /shared/uploads/ und ARIA referenziert sie mit
+[FILE: ...]-Marker in ihrer Antwort.
+
+12 GB VRAM auf der 3060 reichen fuer FLUX.1-dev nur mit
+`enable_model_cpu_offload()` — sonst OOM. Setze FLUX_OFFLOAD=sequential
+fuer Maximal-Sparsamkeit (langsamer) oder FLUX_OFFLOAD=none wenn die
+GPU genug VRAM hat (z.B. spaeter 4090).
+
+Env:
+  RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
+  FLUX_MODEL         Default: black-forest-labs/FLUX.1-dev
+                     Alt: black-forest-labs/FLUX.1-schnell (4-Step, Apache-2.0)
+  FLUX_DEVICE        Default: cuda
+  FLUX_DTYPE         Default: bfloat16 (alt: float16)
+  FLUX_OFFLOAD       Default: model (alt: sequential | none)
+  FLUX_MAX_STEPS     Default: 50
+  FLUX_MAX_DIM       Default: 1536
+"""
+import asyncio
+import base64
+import io
+import json
+import logging
+import os
+import sys
+import threading
+import time
+import uuid
+from typing import Optional
+
+import websockets
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%H:%M:%S",
+)
+logger = logging.getLogger("flux-bridge")
+# HuggingFace/Torch download-Logs daempfen
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+RVS_HOST = os.getenv("RVS_HOST", "").strip()
+RVS_PORT = int(os.getenv("RVS_PORT", "443"))
+RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
+RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
+RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()
+
+FLUX_MODEL = os.getenv("FLUX_MODEL", "black-forest-labs/FLUX.1-dev").strip()
+FLUX_DEVICE = os.getenv("FLUX_DEVICE", "cuda").strip()
+FLUX_DTYPE = os.getenv("FLUX_DTYPE", "bfloat16").strip().lower()
+FLUX_OFFLOAD = os.getenv("FLUX_OFFLOAD", "model").strip().lower()
+FLUX_MAX_STEPS = int(os.getenv("FLUX_MAX_STEPS", "50"))
+FLUX_MAX_DIM = int(os.getenv("FLUX_MAX_DIM", "1536"))
+
+# FLUX-dev native: guidance=3.5, steps=28. FLUX-schnell: guidance=0.0, steps=4.
+DEFAULT_STEPS_DEV = 28
+DEFAULT_STEPS_SCHNELL = 4
+DEFAULT_GUIDANCE_DEV = 3.5
+DEFAULT_GUIDANCE_SCHNELL = 0.0
+
+
+def _is_schnell(model_id: str) -> bool:
+    return "schnell" in model_id.lower()
+
+
+def _torch_dtype():
+    """Lazy-resolve damit Torch erst beim Modell-Laden importiert wird."""
+    import torch
+    return {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}\
+        .get(FLUX_DTYPE, torch.bfloat16)
+
+
+def _snap_dim(v: int, default: int = 1024) -> int:
+    """FLUX braucht Multiples von 16 (sicher: 64). Clamp + Snap."""
+    try:
+        n = int(v)
+    except (TypeError, ValueError):
+        n = default
+    n = max(256, min(FLUX_MAX_DIM, n))
+    # Auf naechstes Vielfaches von 64 abrunden
+    n = (n // 64) * 64
+    return max(256, n)
+
+
+class FluxRunner:
+    """Haelt die FLUX-Pipeline. Synthese laeuft im Executor (blocking).
+
+    GPU ist die Engstelle — wir serialisieren via Queue im Caller, hier
+    nur Single-Lock fuer load. Ein Render auf der 3060 dauert je nach
+    Steps/Aufloesung 20-90 s.
+    """
+
+    def __init__(self) -> None:
+        self.pipe = None
+        self._lock = asyncio.Lock()
+        self.model_id: str = FLUX_MODEL
+        self.last_load_seconds: float = 0.0
+
+    def _load_blocking(self) -> None:
+        import torch
+        from diffusers import FluxPipeline
+
+        logger.info("Lade FLUX '%s' (dtype=%s, offload=%s)...",
+                    self.model_id, FLUX_DTYPE, FLUX_OFFLOAD)
+        t0 = time.time()
+        pipe = FluxPipeline.from_pretrained(self.model_id, torch_dtype=_torch_dtype())
+
+        if FLUX_OFFLOAD == "sequential":
+            pipe.enable_sequential_cpu_offload()
+        elif FLUX_OFFLOAD == "none":
+            pipe.to(FLUX_DEVICE)
+        else:  # "model" — default, Sweet-Spot fuer 12 GB Karten
+            pipe.enable_model_cpu_offload()
+
+        # VAE-Tiling spart VRAM bei grossen Bildern (>1024)
+        try:
+            pipe.vae.enable_tiling()
+        except Exception:
+            pass
+
+        self.pipe = pipe
+        self.last_load_seconds = time.time() - t0
+        logger.info("FLUX geladen in %.1fs", self.last_load_seconds)
+        # CUDA-Cache nach dem Load aufraeumen
+        try:
+            torch.cuda.empty_cache()
+        except Exception:
+            pass
+
+    async def ensure_loaded(self) -> None:
+        async with self._lock:
+            if self.pipe is not None:
+                return
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, self._load_blocking)
+
+    def _generate_blocking(self, prompt: str, width: int, height: int,
+                           steps: int, guidance: float, seed: Optional[int]) -> bytes:
+        import torch
+        gen = None
+        if seed is not None and seed >= 0:
+            gen = torch.Generator(device=FLUX_DEVICE).manual_seed(int(seed))
+
+        logger.info("Render: %dx%d, steps=%d, guidance=%.2f, seed=%s, prompt=%r",
+                    width, height, steps, guidance, seed, prompt[:80])
+        out = self.pipe(
+            prompt=prompt,
+            width=width,
+            height=height,
+            num_inference_steps=steps,
+            guidance_scale=guidance,
+            generator=gen,
+        )
+        image = out.images[0]
+        buf = io.BytesIO()
+        image.save(buf, format="PNG", optimize=True)
+        png_bytes = buf.getvalue()
+        # VRAM zurueckgeben fuer den naechsten Render
+        try:
+            torch.cuda.empty_cache()
+        except Exception:
+            pass
+        return png_bytes
+
+    async def generate(self, prompt: str, width: int, height: int,
+                       steps: int, guidance: float, seed: Optional[int]) -> bytes:
+        await self.ensure_loaded()
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(
+            None, self._generate_blocking, prompt, width, height, steps, guidance, seed,
+        )
+
+
+# ── Helpers ─────────────────────────────────────────────────
+
+
+async def _send(ws, mtype: str, payload: dict) -> None:
+    try:
+        await ws.send(json.dumps({
+            "type": mtype,
+            "payload": payload,
+            "timestamp": int(time.time() * 1000),
+        }))
+    except Exception as e:
+        logger.warning("Send fehlgeschlagen (%s): %s", mtype, e)
+
+
+async def _broadcast_status(ws, state: str, **extra) -> None:
+    """Sendet service_status fuer das Flux-Modul.
+    state: 'loading' | 'ready' | 'error'."""
+    payload = {"service": "flux", "state": state}
+    payload.update(extra)
+    await _send(ws, "service_status", payload)
+
+
+# ── Flux-Request Queue ──────────────────────────────────────
+
+# One GPU, one render at a time: request payloads (dicts) are queued here and
+# consumed by a single worker. Original note: parallel requests would
+# otherwise run out of GPU memory.
+_flux_queue: "asyncio.Queue[dict]" = asyncio.Queue()
+
+
+def _resolve_request(payload: dict, runner: FluxRunner) -> tuple[str, int, int, int, float, Optional[int]]:
+    """Liest Felder aus dem flux_request payload + clampt auf Caps."""
+    prompt = (payload.get("prompt") or "").strip()
+    if not prompt:
+        raise ValueError("prompt fehlt")
+    if len(prompt) > 2000:
+        prompt = prompt[:2000]
+
+    width = _snap_dim(payload.get("width", 1024))
+    height = _snap_dim(payload.get("height", 1024))
+
+    schnell = _is_schnell(runner.model_id)
+    default_steps = DEFAULT_STEPS_SCHNELL if schnell else DEFAULT_STEPS_DEV
+    default_guidance = DEFAULT_GUIDANCE_SCHNELL if schnell else DEFAULT_GUIDANCE_DEV
+
+    try:
+        steps = int(payload.get("steps", default_steps))
+    except (TypeError, ValueError):
+        steps = default_steps
+    steps = max(1, min(FLUX_MAX_STEPS, steps))
+
+    try:
+        guidance = float(payload.get("guidance_scale", default_guidance))
+    except (TypeError, ValueError):
+        guidance = default_guidance
+    if not (0.0 <= guidance <= 20.0):
+        guidance = default_guidance
+
+    seed = payload.get("seed")
+    if seed is not None:
+        try:
+            seed = int(seed)
+        except (TypeError, ValueError):
+            seed = None
+
+    return prompt, width, height, steps, guidance, seed
+
+
+async def _flux_worker(ws, runner: FluxRunner) -> None:
+    """Serialisiert Renders — eine GPU, ein Bild gleichzeitig."""
+    while True:
+        payload = await _flux_queue.get()
+        request_id = payload.get("requestId") or str(uuid.uuid4())
+        try:
+            await _do_render(ws, runner, payload, request_id)
+        except Exception:
+            logger.exception("Flux-Worker Fehler")
+            await _send(ws, "flux_response", {
+                "requestId": request_id,
+                "error": "internal error",
+            })
+        finally:
+            _flux_queue.task_done()
+
+
+async def _do_render(ws, runner: FluxRunner, payload: dict, request_id: str) -> None:
+    t0 = time.time()
+    try:
+        prompt, width, height, steps, guidance, seed = _resolve_request(payload, runner)
+    except ValueError as e:
+        logger.warning("flux_request invalid: %s", e)
+        await _send(ws, "flux_response", {"requestId": request_id, "error": str(e)})
+        return
+
+    # Progress-Ping: User soll sehen dass was passiert (Render >30s realistisch)
+    await _send(ws, "flux_response", {
+        "requestId": request_id,
+        "state": "rendering",
+        "width": width, "height": height, "steps": steps,
+    })
+
+    try:
+        png = await runner.generate(prompt, width, height, steps, guidance, seed)
+    except Exception as e:
+        logger.exception("FLUX Render-Fehler")
+        await _send(ws, "flux_response", {"requestId": request_id, "error": str(e)[:200]})
+        return
+
+    dt = time.time() - t0
+    b64 = base64.b64encode(png).decode("ascii")
+    logger.info("Render fertig: %dx%d, %d KB PNG, %.1fs", width, height, len(png) // 1024, dt)
+
+    await _send(ws, "flux_response", {
+        "requestId": request_id,
+        "state": "done",
+        "base64": b64,
+        "mimeType": "image/png",
+        "width": width,
+        "height": height,
+        "steps": steps,
+        "guidance": guidance,
+        "seed": seed,
+        "model": runner.model_id,
+        "renderSeconds": round(dt, 2),
+        "sizeBytes": len(png),
+    })
+
+
+# ── Haupt-Loop ──────────────────────────────────────────────
+
+
+async def run_loop(runner: FluxRunner) -> None:
+    use_tls = RVS_TLS
+    retry_s = 2
+    tls_fallback_tried = False
+
+    while True:
+        scheme = "wss" if use_tls else "ws"
+        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
+        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
+
+        try:
+            logger.info("Verbinde zu RVS: %s", masked)
+            # max_size 100 MB damit ein 4 MP PNG (~5-10 MB → ~13 MB base64)
+            # locker reinpasst. Mit dem RVS-Limit (100 MB) konsistent.
+            async with websockets.connect(url, ping_interval=20, ping_timeout=10,
+                                          max_size=100 * 1024 * 1024) as ws:
+                logger.info("RVS verbunden")
+                retry_s = 2
+                tls_fallback_tried = False
+
+                async def _load_with_status():
+                    try:
+                        if runner.pipe is not None:
+                            logger.info("Initial: broadcaste ready (Pipeline schon im RAM: %s)",
+                                        runner.model_id)
+                            await _broadcast_status(ws, "ready",
+                                                    model=runner.model_id,
+                                                    loadSeconds=runner.last_load_seconds)
+                        else:
+                            logger.info("Initial: broadcaste loading + lade '%s'", runner.model_id)
+                            await _broadcast_status(ws, "loading", model=runner.model_id)
+                            await runner.ensure_loaded()
+                            await _broadcast_status(ws, "ready",
+                                                    model=runner.model_id,
+                                                    loadSeconds=runner.last_load_seconds)
+                    except Exception as e:
+                        logger.exception("Initial-Load crashed: %s", e)
+                        try:
+                            await _broadcast_status(ws, "error", error=str(e)[:200])
+                        except Exception:
+                            pass
+                asyncio.create_task(_load_with_status())
+
+                worker = asyncio.create_task(_flux_worker(ws, runner))
+
+                try:
+                    async for raw in ws:
+                        try:
+                            msg = json.loads(raw)
+                        except Exception:
+                            continue
+                        mtype = msg.get("type", "")
+                        payload = msg.get("payload", {}) or {}
+
+                        if mtype == "flux_request":
+                            await _flux_queue.put(payload)
+                finally:
+                    worker.cancel()
+                    try:
+                        await worker
+                    except asyncio.CancelledError:
+                        pass
+        except Exception as e:
+            logger.warning("Verbindung verloren: %s", e)
+            if use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
+                logger.info("TLS fehlgeschlagen — Fallback auf ws://")
+                use_tls = False
+                tls_fallback_tried = True
+                continue
+            await asyncio.sleep(min(retry_s, 30))
+            retry_s = min(retry_s * 2, 30)
+
+
+async def main() -> None:
+    if not RVS_HOST:
+        logger.error("RVS_HOST nicht gesetzt — Abbruch")
+        sys.exit(1)
+    runner = FluxRunner()
+    await run_loop(runner)
+
+
+# Script entry point: run the bridge until interrupted; Ctrl+C exits with
+# status 0 instead of printing a traceback.
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        sys.exit(0)
@@ -0,0 +1,60 @@
+# ════════════════════════════════════════════════
+#  ARIA FLUX-Bridge — Text-to-Image (GPU)
+#  Eigener Stack, weil FLUX auch auf einer anderen
+#  Maschine als f5tts/whisper laufen kann (z.B. 4090
+#  separat vom Gaming-PC). Verbindet sich selbst per
+#  WebSocket zum RVS und lauscht auf flux_request.
+# ════════════════════════════════════════════════
+#
+#  Voraussetzungen:
+#    - NVIDIA-GPU mit >= 12 GB VRAM (3060 reicht mit
+#      enable_model_cpu_offload). Bei < 12 GB:
+#      FLUX_OFFLOAD=sequential setzen, sonst OOM.
+#    - Docker mit NVIDIA Container Toolkit
+#    - HuggingFace-Token in .env (FLUX.1-dev ist gated)
+#    - .env mit RVS-Verbindungsdaten (gleiche wie xtts!)
+#
+#  Start: docker compose up -d
+# ════════════════════════════════════════════════
+
+services:
+
+  # ─── FLUX.1-dev Bildgenerierung (GPU) ─────────
+  # Empfaengt flux_request via RVS, rendert PNG mit FLUX.1-dev (12B Params)
+  # und broadcastet flux_response mit base64-PNG zurueck. aria-bridge speichert
+  # die Datei nach /shared/uploads/ und ARIA referenziert sie via [FILE:]-Marker.
+  #
+  # Modell-Wahl per FLUX_MODEL:
+  #   - black-forest-labs/FLUX.1-dev      (Default, 28 Steps, non-commercial)
+  #   - black-forest-labs/FLUX.1-schnell  (4 Steps, Apache-2.0, schneller)
+  # HuggingFace-Token noetig fuer FLUX.1-dev — HF_TOKEN in .env setzen (oder
+  # `huggingface-cli login` IM Container ausfuehren; ein Login auf dem Host
+  # landet nicht im gemounteten ./hf-cache), sonst 401/403 beim ersten Download.
+  flux-bridge:
+    build: .
+    container_name: aria-flux-bridge
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    environment:
+      - RVS_HOST=${RVS_HOST}
+      - RVS_PORT=${RVS_PORT:-443}
+      - RVS_TLS=${RVS_TLS:-true}
+      - RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
+      - RVS_TOKEN=${RVS_TOKEN}
+      - FLUX_MODEL=${FLUX_MODEL:-black-forest-labs/FLUX.1-dev}
+      - FLUX_DEVICE=${FLUX_DEVICE:-cuda}
+      - FLUX_DTYPE=${FLUX_DTYPE:-bfloat16}
+      - FLUX_OFFLOAD=${FLUX_OFFLOAD:-model}
+      - FLUX_MAX_STEPS=${FLUX_MAX_STEPS:-50}
+      - FLUX_MAX_DIM=${FLUX_MAX_DIM:-1536}
+      - HF_TOKEN=${HF_TOKEN:-}        # FLUX.1-dev braucht Login-Token
+    volumes:
+      - ./hf-cache:/root/.cache/huggingface      # Bind-Mount. FLUX.1-dev ~24 GB on disk!
+                                                 # Wenn flux auf der gleichen Maschine
+                                                 # wie xtts laeuft: ../xtts/hf-cache
+                                                 # symlinken um den Cache zu teilen.
+    restart: unless-stopped
@@ -0,0 +1,9 @@
+diffusers>=0.30.0
+transformers>=4.43.0
+accelerate>=0.33.0
+sentencepiece>=0.2.0
+protobuf>=4.25.0
+pillow>=10.0.0
+huggingface_hub>=0.24.0
+websockets>=12.0
+numpy>=1.24