From c677cfed2469d7eb290fbf940369e213c019e920 Mon Sep 17 00:00:00 2001 From: duffyduck Date: Wed, 13 May 2026 02:27:02 +0200 Subject: [PATCH] =?UTF-8?q?feat(brain):=20Anhaenge=20an=20Memory-Eintraege?= =?UTF-8?q?=20(Stufe=20A=20=E2=80=94=20Backend)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pro Memory koennen jetzt Dateien (Bilder, PDFs, Sound, ...) angehaengt werden. Use-Case: Stefan sagt "ich hab eine Cessna 172" und pinnt gleich ein Foto dran — ARIA sieht spaeter neben dem Memory auch die visuelle Referenz (Stufe E = Multi-Modal-Pipeline). Stufe A baut nur den Backend-Layer; UI kommt in Stufe B (Diagnostic) und C (App). Anhaenge werden in Stufe A nur via HTTP-API gepflegt (curl), ARIA selbst kann sie noch nicht hochladen — sinnvoll erst wenn die Vision-Pipeline (Stufe E) steht. Komponenten: - memory_attachments.py: neuer Storage-Helper. Layout /shared/memory-attachments/<memory_id>/<filename>. Filename-Sanitization (kein Path-Traversal), Limit 20 MB konfigurierbar, save/list/delete/read_bytes + delete_all fuer Cleanup beim Memory-Delete. - vector_store.py: MemoryPoint.attachments (List[dict]) — Metadaten {name, mime, size, path} im Qdrant-Payload damit Suche/Anzeige sie ohne Filesystem-Lookup kennt. 
- main.py: - MemoryIn akzeptiert attachments-Liste (fuer Restore-Faelle) - MemoryOut liefert attachments - GET /memory/{id}/attachments → Liste vom FS - POST /memory/{id}/attachments → Base64-Upload, schreibt FS + updated Payload-Liste - DELETE /memory/{id}/attachments/{filename} → FS + Payload-Eintrag weg - GET /memory/{id}/attachments/{filename} → Bytes mit MIME serve - /memory/delete cleanup: ruft attachments.delete_all damit kein Verzeichnis verwaist Smoke-Test nach Brain-Rebuild (Stefan auf VM): # Memory-ID rauspicken ID=$(curl -s "$ARIA_BRAIN_URL/memory/list?type=fact" | python3 -c "import sys,json;print(json.load(sys.stdin)[0]['id'])") # Bild als Base64 hochladen B64=$(base64 -w0 /pfad/zu/foto.jpg) curl -s -X POST "$ARIA_BRAIN_URL/memory/$ID/attachments" \ -H 'Content-Type: application/json' \ -d "{\"name\":\"foto.jpg\",\"data_base64\":\"$B64\"}" | jq # Liste anzeigen curl -s "$ARIA_BRAIN_URL/memory/$ID/attachments" | jq # Datei wieder laden curl -s "$ARIA_BRAIN_URL/memory/$ID/attachments/foto.jpg" -o /tmp/back.jpg Stufe B (Diagnostic-UI) folgt sobald A getestet ist. Co-Authored-By: Claude Opus 4.7 (1M context) --- aria-brain/main.py | 106 +++++++++++++++++++++++ aria-brain/memory/vector_store.py | 7 ++ aria-brain/memory_attachments.py | 137 ++++++++++++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 aria-brain/memory_attachments.py diff --git a/aria-brain/main.py b/aria-brain/main.py index c06cd92..3f241be 100644 --- a/aria-brain/main.py +++ b/aria-brain/main.py @@ -114,6 +114,10 @@ class MemoryIn(BaseModel): source: str = "manual" tags: List[str] = Field(default_factory=list) conversation_id: Optional[str] = None + # Vorhandene Anhang-Metadaten beim Save mitgeben (i.d.R. werden Anhaenge + # nach dem Save via /memory/{id}/attachments hinzugefuegt — hier eher fuer + # Bootstrap-Import/Restore-Faelle relevant). 
+ attachments: List[dict] = Field(default_factory=list) class MemoryUpdate(BaseModel): @@ -137,12 +141,19 @@ class MemoryOut(BaseModel): updated_at: str conversation_id: Optional[str] = None score: Optional[float] = None + attachments: List[dict] = Field(default_factory=list) @classmethod def from_point(cls, p: MemoryPoint) -> "MemoryOut": return cls(**p.__dict__) +class AttachmentUploadBody(BaseModel): + """Base64-Upload via JSON — Diagnostic schickt Files so.""" + name: str + data_base64: str + + # ─── Health ─────────────────────────────────────────────────────────── @app.get("/health") @@ -231,6 +242,7 @@ def memory_save(body: MemoryIn): source=body.source, tags=body.tags, conversation_id=body.conversation_id, + attachments=body.attachments or [], ) pid = s.upsert(point, vec) saved = s.get(pid) @@ -279,9 +291,103 @@ def memory_delete(point_id: str): if not s.get(point_id): raise HTTPException(404, f"Memory {point_id} nicht gefunden") s.delete(point_id) + # Anhaenge mit-loeschen damit nichts verwaist + try: + import memory_attachments as mem_att + n = mem_att.delete_all(point_id) + if n: + logger.info("Memory %s + %d Anhaenge geloescht", point_id, n) + except Exception as exc: + logger.warning("Anhang-Cleanup fuer %s fehlgeschlagen: %s", point_id, exc) return {"deleted": point_id} +# ─── Memory-Anhaenge ────────────────────────────────────────────────── + +@app.get("/memory/{point_id}/attachments") +def memory_attachments_list(point_id: str): + """Liste der Anhaenge zum Memory. 
Source-of-Truth ist das Payload + in der DB, aber wir mergen vorsichtshalber mit dem Filesystem-Stand + (falls ein Upload-Restart zwischendrin schiefging).""" + import memory_attachments as mem_att + s = store() + m = s.get(point_id) + if not m: + raise HTTPException(404, f"Memory {point_id} nicht gefunden") + return {"memory_id": point_id, "attachments": mem_att.list_attachments(point_id)} + + +@app.post("/memory/{point_id}/attachments", response_model=MemoryOut) +def memory_attachments_add(point_id: str, body: AttachmentUploadBody): + """Anhang als Base64 hochladen + im Memory-Payload eintragen.""" + import memory_attachments as mem_att + s = store() + m = s.get(point_id) + if not m: + raise HTTPException(404, f"Memory {point_id} nicht gefunden") + try: + meta = mem_att.save_from_base64(point_id, body.name, body.data_base64) + except ValueError as exc: + raise HTTPException(400, str(exc)) + + # Payload aktualisieren — neuer Anhang ans Ende, Duplikate (gleicher + # Filename) werden ersetzt damit die Liste nicht zweimal denselben + # Eintrag hat + atts = [a for a in (m.attachments or []) if a.get("name") != meta["name"]] + atts.append(meta) + m.attachments = atts + from qdrant_client.http import models as qm + from memory.vector_store import COLLECTION + import datetime as _dt + m.updated_at = _dt.datetime.now(_dt.timezone.utc).isoformat() + s.client.set_payload( + collection_name=COLLECTION, + payload=m.to_payload() | {"updated_at": m.updated_at}, + points=[point_id], + ) + return MemoryOut.from_point(s.get(point_id)) + + +@app.delete("/memory/{point_id}/attachments/{filename}", response_model=MemoryOut) +def memory_attachments_delete(point_id: str, filename: str): + """Einzelnen Anhang loeschen (FS + Payload-Eintrag).""" + import memory_attachments as mem_att + s = store() + m = s.get(point_id) + if not m: + raise HTTPException(404, f"Memory {point_id} nicht gefunden") + removed_fs = mem_att.delete_attachment(point_id, filename) + safe = filename # Cleanup 
synchron mit FS — Payload-Match per name + atts = [a for a in (m.attachments or []) if a.get("name") not in (filename, safe)] + m.attachments = atts + from qdrant_client.http import models as qm + from memory.vector_store import COLLECTION + import datetime as _dt + m.updated_at = _dt.datetime.now(_dt.timezone.utc).isoformat() + s.client.set_payload( + collection_name=COLLECTION, + payload=m.to_payload() | {"updated_at": m.updated_at}, + points=[point_id], + ) + if not removed_fs and not atts: + # weder im FS noch im Payload war was — Anhang existierte nicht + raise HTTPException(404, f"Anhang {filename} nicht gefunden") + return MemoryOut.from_point(s.get(point_id)) + + +@app.get("/memory/{point_id}/attachments/{filename}") +def memory_attachments_get(point_id: str, filename: str): + """Liefert die Bytes eines Anhangs. Diagnostic-Server kann das + durchproxien zur Vorschau/Download in der UI.""" + import memory_attachments as mem_att + import mimetypes as _mt + data = mem_att.read_bytes(point_id, filename) + if data is None: + raise HTTPException(404, f"Anhang {filename} nicht gefunden") + mime = _mt.guess_type(filename)[0] or "application/octet-stream" + return Response(content=data, media_type=mime) + + + # ─── Migration aus brain-import/ ────────────────────────────────────── IMPORT_DIR = os.environ.get("IMPORT_DIR", "/import") diff --git a/aria-brain/memory/vector_store.py b/aria-brain/memory/vector_store.py index 6e96af3..3a575ac 100644 --- a/aria-brain/memory/vector_store.py +++ b/aria-brain/memory/vector_store.py @@ -60,6 +60,11 @@ class MemoryPoint: updated_at: str = "" conversation_id: Optional[str] = None score: Optional[float] = None # nur bei Search gesetzt + # Anhaenge: Liste von Dicts {name, mime, size, path} — Dateien liegen + # physisch unter /shared/memory-attachments/<memory_id>/<filename>. + # Hier in der DB nur die Metadaten, damit die Suche/Anzeige sie kennt + # ohne Filesystem zu pruefen. 
+ attachments: List[dict] = field(default_factory=list) def to_payload(self) -> dict: p = { @@ -72,6 +77,7 @@ class MemoryPoint: "tags": self.tags, "created_at": self.created_at, "updated_at": self.updated_at, + "attachments": self.attachments, } if self.conversation_id: p["conversation_id"] = self.conversation_id @@ -92,6 +98,7 @@ class MemoryPoint: created_at=payload.get("created_at", ""), updated_at=payload.get("updated_at", ""), conversation_id=payload.get("conversation_id"), + attachments=payload.get("attachments", []) or [], score=getattr(point, "score", None), ) diff --git a/aria-brain/memory_attachments.py b/aria-brain/memory_attachments.py new file mode 100644 index 0000000..164956c --- /dev/null +++ b/aria-brain/memory_attachments.py @@ -0,0 +1,137 @@ +""" +Anhaenge fuer Memory-Eintraege. + +Storage-Layout: + /shared/memory-attachments/<memory_id>/<filename> + +Eine flache Ordnerstruktur pro Memory — bei Memory-Delete loescht main.py +das ganze Verzeichnis. Anhang-Metadaten (name, mime, size, path) liegen +zusaetzlich im Qdrant-Payload des Memory-Punkts damit die Listen/Suche +sie ohne Filesystem-Lookup zeigen kann. + +Anhaenge sind erstmal nur ueber die Diagnostic-UI hochladbar — ARIA +selbst hat in Stufe A kein Tool zum Upload. 
+""" + +from __future__ import annotations + +import base64 +import logging +import mimetypes +import os +import re +import shutil +from pathlib import Path +from typing import List, Optional + +logger = logging.getLogger(__name__) + +ROOT = Path(os.environ.get("MEMORY_ATTACHMENTS_DIR", "/shared/memory-attachments")) +MAX_BYTES = int(os.environ.get("MEMORY_ATTACHMENT_MAX_BYTES", str(20 * 1024 * 1024))) # 20 MB +SAFE_NAME_RE = re.compile(r"[^A-Za-z0-9._\-]") + + +def _safe_filename(name: str) -> str: + """Macht aus einem User-Namen einen filesystem-sicheren String — + zerlegt Pfadteile, schneidet Sonderzeichen weg, kuerzt auf 120 Zeichen.""" + base = Path(name).name or "datei" + base = SAFE_NAME_RE.sub("_", base).strip("._-") or "datei" + return base[:120] + + +def memory_dir(memory_id: str) -> Path: + return ROOT / memory_id + + +def list_attachments(memory_id: str) -> List[dict]: + """Liest die Anhaenge fuer eine Memory aus dem Filesystem. + Returns [{name, mime, size, path}, ...] — leer wenn nichts da. + Source of Truth ist Qdrant-Payload; diese Funktion ist nur fuer + Diagnostic-Endpoints wenn Stefan direkt das FS prueft.""" + d = memory_dir(memory_id) + if not d.is_dir(): + return [] + out = [] + for f in sorted(d.iterdir()): + if not f.is_file(): + continue + out.append(_file_meta(memory_id, f)) + return out + + +def _file_meta(memory_id: str, f: Path) -> dict: + try: + size = f.stat().st_size + except Exception: + size = 0 + mime = mimetypes.guess_type(f.name)[0] or "application/octet-stream" + return { + "name": f.name, + "mime": mime, + "size": size, + "path": str(f), # absoluter Pfad im Container + } + + +def save_attachment(memory_id: str, filename: str, data: bytes) -> dict: + """Schreibt einen Anhang ins FS und gibt seine Metadaten zurueck. 
+ Ueberschreibt eine bestehende Datei mit gleichem Namen.""" + if not memory_id: + raise ValueError("memory_id ist Pflicht") + if len(data) > MAX_BYTES: + raise ValueError(f"Anhang zu gross ({len(data)} > {MAX_BYTES} Byte)") + safe = _safe_filename(filename) + d = memory_dir(memory_id) + d.mkdir(parents=True, exist_ok=True) + target = d / safe + target.write_bytes(data) + logger.info("[mem-att] %s -> %s (%d Byte)", memory_id, safe, len(data)) + return _file_meta(memory_id, target) + + +def save_from_base64(memory_id: str, filename: str, b64: str) -> dict: + """Convenience fuer Base64-Uploads (Diagnostic schickt Files so).""" + try: + data = base64.b64decode(b64, validate=False) + except Exception as exc: + raise ValueError(f"Base64-Decode fehlgeschlagen: {exc}") from exc + return save_attachment(memory_id, filename, data) + + +def delete_attachment(memory_id: str, filename: str) -> bool: + """Loescht eine einzelne Anhang-Datei. Returns True wenn was weg ist.""" + safe = _safe_filename(filename) + target = memory_dir(memory_id) / safe + if not target.is_file(): + return False + try: + target.unlink() + logger.info("[mem-att] %s/%s geloescht", memory_id, safe) + return True + except Exception as exc: + logger.warning("[mem-att] Loeschen fehlgeschlagen: %s", exc) + return False + + +def delete_all(memory_id: str) -> int: + """Loescht das komplette Memory-Verzeichnis. 
Wird beim Memory-Delete + in main.py gerufen damit nichts verwaist.""" + d = memory_dir(memory_id) + if not d.is_dir(): + return 0 + count = sum(1 for _ in d.iterdir() if _.is_file()) + try: + shutil.rmtree(d) + logger.info("[mem-att] %s komplett entfernt (%d Files)", memory_id, count) + except Exception as exc: + logger.warning("[mem-att] rmtree fehlgeschlagen: %s", exc) + return count + + +def read_bytes(memory_id: str, filename: str) -> Optional[bytes]: + """Liefert die rohen Bytes einer Datei zurueck — fuer Download/Serve.""" + safe = _safe_filename(filename) + target = memory_dir(memory_id) / safe + if not target.is_file(): + return None + return target.read_bytes()