From 6549fcbce8b44bb6ba80be63fe28bce028b15d8e Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Tue, 12 May 2026 16:59:31 +0200
Subject: [PATCH] =?UTF-8?q?feat(brain):=20Volltext-Suche=20zusaetzlich=20z?=
 =?UTF-8?q?u=20Semantic=20=E2=80=94=20Default=20ist=20jetzt=20Wortlich?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stefan wollte ne richtige Suche statt nur "klingt aehnlich". Beide
Modi sind jetzt verfuegbar, Default ist Volltext:

- 📝 Woertlich (Substring, case-insensitive ueber Title + Content +
  Category + Tags) — neuer Endpoint /memory/search-text. Full-Scan
  via Qdrant scroll, k=50. Findet "cessna" exakt im Content. Bei
  kleiner DB (<1000 Eintraege) ist die Performance unkritisch.

- 🧠 Semantisch (Embedder + score_threshold 0.30) — bestehender
  /memory/search Endpoint. Findet konzeptuell verwandte Eintraege.

Diagnostic UI: Dropdown neben dem Suchfeld zum Modus-Wechsel.
Info-Banner zeigt klar welcher Modus aktiv ist.

Warum Woertlich Default: bei kleiner DB liefert Semantic gern False
Positives mit Score 0.30-0.45 fuer komplett unverwandte Begriffe
(z.B. "cessna" matched "Tageslog fuehren" mit 0.43). Woertlich ist
deterministisch und vermeidet das Rauschen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 aria-brain/main.py                | 17 ++++++++++
 aria-brain/memory/vector_store.py | 53 +++++++++++++++++++++++++++++++
 diagnostic/index.html             | 36 +++++++++++++++------
 3 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/aria-brain/main.py b/aria-brain/main.py
index e821b9c..3665091 100644
--- a/aria-brain/main.py
+++ b/aria-brain/main.py
@@ -181,6 +181,23 @@ def memory_pinned():
     return [MemoryOut.from_point(p) for p in store().list_pinned()]
 
 
+@app.get("/memory/search-text", response_model=List[MemoryOut])
+def memory_search_text(
+    q: str,
+    k: int = 50,
+    type: Optional[str] = None,
+    include_pinned: bool = True,
+):
+    """Full-text substring search (case-insensitive) over title + content +
+    category + tags. Finds exact terms — e.g. 'cessna' matches 'Cessna 172'.
+    Unlike /memory/search (semantic), it returns no 'sounds similar' hits."""
+    points = store().search_text(
+        q, k=k, type_filter=type,
+        exclude_pinned=not include_pinned,
+    )
+    return [MemoryOut.from_point(p) for p in points]
+
+
 @app.get("/memory/search", response_model=List[MemoryOut])
 def memory_search(
     q: str,
diff --git a/aria-brain/memory/vector_store.py b/aria-brain/memory/vector_store.py
index 859757a..6e96af3 100644
--- a/aria-brain/memory/vector_store.py
+++ b/aria-brain/memory/vector_store.py
@@ -213,3 +213,56 @@ class VectorStore:
 
     def count(self) -> int:
         return self.client.count(collection_name=COLLECTION, exact=True).count
+
+    def search_text(
+        self,
+        query: str,
+        k: int = 20,
+        type_filter: Optional[str] = None,
+        exclude_pinned: bool = False,
+    ) -> List[MemoryPoint]:
+        """Return up to ``k`` points whose title, content, category or tags
+        contain ``query`` as a case-insensitive substring (no semantic match).
+
+        Scrolls the whole collection, optionally restricted to ``type_filter``
+        and/or to non-pinned points, and stops early once ``k`` hits are found.
+        An empty/blank query or a non-positive ``k`` yields an empty list."""
+        q = (query or "").strip().casefold()
+        if not q or k <= 0:
+            return []
+        must = []
+        must_not = []
+        if type_filter:
+            must.append(qm.FieldCondition(key="type", match=qm.MatchValue(value=type_filter)))
+        if exclude_pinned:
+            must_not.append(qm.FieldCondition(key="pinned", match=qm.MatchValue(value=True)))
+        flt = qm.Filter(must=must or None, must_not=must_not or None) if (must or must_not) else None
+
+        matches: List[MemoryPoint] = []
+        offset = None
+        while True:
+            points, offset = self.client.scroll(
+                collection_name=COLLECTION,
+                scroll_filter=flt,
+                limit=200,
+                offset=offset,
+                with_payload=True,
+                with_vectors=False,
+            )
+            for p in points:
+                payload = p.payload or {}
+                tags = payload.get("tags")
+                tags_str = " ".join(map(str, tags)) if isinstance(tags, list) else ""
+                haystack = " ".join([  # `or ""`: a stored null must not become the text "None"
+                    str(payload.get("title") or ""),
+                    str(payload.get("content") or ""),
+                    str(payload.get("category") or ""),
+                    tags_str,
+                ]).casefold()
+                if q in haystack:
+                    matches.append(MemoryPoint.from_qdrant(p))
+                    if len(matches) >= k:
+                        return matches
+            if offset is None:  # next offset is a point id and may be falsy (0); only None ends the scroll
+                break
+        return matches
diff --git a/diagnostic/index.html b/diagnostic/index.html
index 8b7a87e..ad4224f 100644
--- a/diagnostic/index.html
+++ b/diagnostic/index.html
@@ -824,9 +824,15 @@
       </div>
       <div class="card" style="margin-bottom:8px;">
         <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center;">
-          <input type="text" id="brain-search" placeholder="Semantische Suche (z.B. 'Stefan Persönlichkeit')..."
+          <input type="text" id="brain-search" placeholder="Suche (z.B. 'cessna' oder 'Stefan Persönlichkeit')..."
                  style="flex:1;min-width:200px;background:#080810;color:#E0E0F0;border:1px solid #1E1E2E;padding:6px 8px;border-radius:4px;font-family:inherit;font-size:12px;"
                  onkeydown="if(event.key==='Enter') runBrainSearch()">
+          <select id="brain-search-mode" onchange="if(document.getElementById('brain-search').value.trim()) runBrainSearch()"
+                  title="Wortlich = exakter Substring-Match. Semantisch = 'klingt aehnlich' via Embeddings."
+                  style="background:#080810;color:#E0E0F0;border:1px solid #1E1E2E;padding:6px;border-radius:4px;font-family:inherit;font-size:11px;">
+            <option value="text" selected>📝 Wortlich</option>
+            <option value="semantic">🧠 Semantisch</option>
+          </select>
           <button class="btn secondary" onclick="runBrainSearch()" style="padding:4px 12px;font-size:11px;">Suchen</button>
           <select id="brain-filter-type" onchange="loadBrainMemoryList()"
                   style="background:#080810;color:#E0E0F0;border:1px solid #1E1E2E;padding:6px;border-radius:4px;font-family:inherit;font-size:11px;">
@@ -3457,13 +3463,23 @@
         return;
       }
       const typeFilter = document.getElementById('brain-filter-type').value;
-      // k=10 + Score-Threshold im Backend (0.30) → nur relevante Treffer.
-      // Frueher k=20 ohne Threshold: bei kleiner DB landete fast alles
-      // als "Treffer", egal wie unaehnlich.
-      const params = new URLSearchParams({ q, k: '10', include_pinned: 'true', score_threshold: '0.30' });
-      if (typeFilter) params.set('type', typeFilter);
+      const mode = (document.getElementById('brain-search-mode')?.value) || 'text';
+      let url, modeLabel;
+      if (mode === 'semantic') {
+        // Embedder-basiert, mit Score-Threshold gegen Rauschen
+        const params = new URLSearchParams({ q, k: '10', include_pinned: 'true', score_threshold: '0.30' });
+        if (typeFilter) params.set('type', typeFilter);
+        url = '/api/brain/memory/search?' + params.toString();
+        modeLabel = '🧠 semantisch (Score ≥ 0.30)';
+      } else {
+        // Volltext-Substring (case-insensitive) — findet exakte Begriffe
+        const params = new URLSearchParams({ q, k: '50', include_pinned: 'true' });
+        if (typeFilter) params.set('type', typeFilter);
+        url = '/api/brain/memory/search-text?' + params.toString();
+        modeLabel = '📝 wortlich (Substring)';
+      }
       try {
-        const r = await fetch('/api/brain/memory/search?' + params.toString());
+        const r = await fetch(url);
         if (!r.ok) throw new Error('HTTP ' + r.status);
         const hits = await r.json();
         hits.forEach(m => { brainMemoryCache[m.id] = m; });
@@ -3471,13 +3487,13 @@
         if (info) {
           info.style.display = 'block';
           if (hits.length === 0) {
-            info.innerHTML = `🔍 Keine relevanten Treffer für "${escapeHtml(q)}"` +
+            info.innerHTML = `🔍 Keine Treffer für "${escapeHtml(q)}"` +
               (typeFilter ? ` · Typ=${escapeHtml(typeFilter)}` : '') +
-              ` (Score < 0.30). Versuche andere Begriffe oder klicke das ✕ rechts um die Suche zu schliessen.`;
+              ` · ${modeLabel}. Anderen Begriff probieren oder ✕ rechts um Suche zu schliessen.`;
           } else {
             info.innerHTML = `🔍 ${hits.length} Treffer für "${escapeHtml(q)}"` +
               (typeFilter ? ` · Typ=${escapeHtml(typeFilter)}` : '') +
-              ` · sortiert nach Aehnlichkeit (Score &ge; 0.30)`;
+              ` · ${modeLabel}`;
           }
         }
         renderBrainList(hits, true);