From 6549fcbce8b44bb6ba80be63fe28bce028b15d8e Mon Sep 17 00:00:00 2001 From: duffyduck Date: Tue, 12 May 2026 16:59:31 +0200 Subject: [PATCH] =?UTF-8?q?feat(brain):=20Volltext-Suche=20zusaetzlich=20z?= =?UTF-8?q?u=20Semantic=20=E2=80=94=20Default=20ist=20jetzt=20Woertlich?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stefan wollte ne richtige Suche statt nur "klingt aehnlich". Beide Modi sind jetzt verfuegbar, Default ist Volltext: - 📝 Woertlich (Substring, case-insensitive ueber Title + Content + Category + Tags) — neuer Endpoint /memory/search-text. Full-Scan via Qdrant scroll, k=50. Findet "cessna" exakt im Content. Bei kleiner DB (<1000 Eintraege) unkritisch performant. - 🧠 Semantisch (Embedder + score_threshold 0.30) — bestehender /memory/search Endpoint. Findet konzeptuell verwandte Eintraege. Diagnostic UI: Dropdown neben dem Suchfeld zum Modus-Wechsel. Info-Banner zeigt klar, welcher Modus aktiv ist. Warum Woertlich Default: bei kleiner DB liefert Semantic gern False Positives mit Score 0.30-0.45 fuer komplett unverwandte Begriffe (z.B. "cessna" matched "Tageslog fuehren" mit 0.43). Woertlich ist deterministisch und vermeidet das Rauschen. Co-Authored-By: Claude Opus 4.7 (1M context) --- aria-brain/main.py | 17 ++++++++++ aria-brain/memory/vector_store.py | 53 +++++++++++++++++++++++++++++++ diagnostic/index.html | 36 +++++++++++++++------ 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/aria-brain/main.py b/aria-brain/main.py index e821b9c..3665091 100644 --- a/aria-brain/main.py +++ b/aria-brain/main.py @@ -181,6 +181,23 @@ def memory_pinned(): return [MemoryOut.from_point(p) for p in store().list_pinned()] +@app.get("/memory/search-text", response_model=List[MemoryOut]) +def memory_search_text( + q: str, + k: int = 50, + type: Optional[str] = None, + include_pinned: bool = True, +): + """Volltext-Substring-Suche (case-insensitive) ueber Title + Content + + Category + Tags. 
Findet exakte Begriffe — z.B. 'cessna' matched 'Cessna 172'. + Im Gegensatz zu /memory/search (semantic) keine 'klingt aehnlich'-Treffer.""" + points = store().search_text( + q, k=k, type_filter=type, + exclude_pinned=not include_pinned, + ) + return [MemoryOut.from_point(p) for p in points] + + @app.get("/memory/search", response_model=List[MemoryOut]) def memory_search( q: str, diff --git a/aria-brain/memory/vector_store.py b/aria-brain/memory/vector_store.py index 859757a..6e96af3 100644 --- a/aria-brain/memory/vector_store.py +++ b/aria-brain/memory/vector_store.py @@ -213,3 +213,56 @@ class VectorStore: def count(self) -> int: return self.client.count(collection_name=COLLECTION, exact=True).count + + def search_text( + self, + query: str, + k: int = 20, + type_filter: Optional[str] = None, + exclude_pinned: bool = False, + ) -> List[MemoryPoint]: + """Volltext-Substring-Suche (case-insensitive) ueber Title + + Content + Category + Tags. Im Gegensatz zu search() ist das KEIN + Semantic-Match — nur exakte Wort-/Teilwort-Treffer. + + Full-Scan ueber alle (gefilteren) Punkte. 
Bei der erwarteten + Groessenordnung (< 1000) unkritisch.""" + q = (query or "").strip().lower() + if not q: + return [] + must = [] + must_not = [] + if type_filter: + must.append(qm.FieldCondition(key="type", match=qm.MatchValue(value=type_filter))) + if exclude_pinned: + must_not.append(qm.FieldCondition(key="pinned", match=qm.MatchValue(value=True))) + flt = qm.Filter(must=must or None, must_not=must_not or None) if (must or must_not) else None + + matches: List[MemoryPoint] = [] + offset = None + while True: + points, offset = self.client.scroll( + collection_name=COLLECTION, + scroll_filter=flt, + limit=200, + offset=offset, + with_payload=True, + with_vectors=False, + ) + for p in points: + payload = p.payload or {} + tags = payload.get("tags") + tags_str = " ".join(tags) if isinstance(tags, list) else "" + haystack = " ".join([ + str(payload.get("title", "")), + str(payload.get("content", "")), + str(payload.get("category", "")), + tags_str, + ]).lower() + if q in haystack: + matches.append(MemoryPoint.from_qdrant(p)) + if len(matches) >= k: + return matches + if not offset: + break + return matches diff --git a/diagnostic/index.html b/diagnostic/index.html index 8b7a87e..ad4224f 100644 --- a/diagnostic/index.html +++ b/diagnostic/index.html @@ -824,9 +824,15 @@
- +