Compare commits

293 commits: `aafdbcd57a` … `v0.1.2.2`
`.gitignore` (+11 −8):

@@ -9,15 +9,19 @@
.env.*
!.env.example
!.env.*.example
aria-data/config/*.env
!aria-data/config/*.env.example
!aria-data/config/openclaw.env

# Private user-profile snippet (tool stack, internal URLs) —
# now lives in brain-import/ (formerly aria-data/config/USER.md).
# USER.md.example is repo content; create USER.md locally yourself.
aria-data/brain-import/USER.md

# ── ARIA's memory (vector DB, skills, models) ──
# Backup via Diagnostic → brain export (tar.gz), not via Git.
aria-data/brain/data/
aria-data/brain/qdrant/

# Diagnostic state (active session etc.)
aria-data/config/diag-state/

# ── Node / npm ──────────────────────────────────
node_modules/

@@ -46,7 +50,6 @@ desktop/dist/
__pycache__/
*.pyc
*.pyo
bridge/__pycache__/

# ── macOS ────────────────────────────────────────
.DS_Store

Binary file not shown.
@@ -34,13 +34,21 @@ ARIA has two roles:
└───────────┬───────────────────────────┬─────────────────┘
            │ WebSocket tunnel          │ WebSocket tunnel
            ▼                           ▼
┌─────────────────────────────────┐
│  Gamebox (Windows + WSL2)       │
│  RTX 3060, Docker Desktop       │
│  ┌──────────────────────────┐   │
│  │ aria-f5tts-bridge        │   │
│  │ F5-TTS voice cloning     │   │
│  │ PCM streaming to the app │   │
│  ├──────────────────────────┤   │
│  │ aria-whisper-bridge      │   │
│  │ Faster-Whisper CUDA      │   │
│  │ near-real-time STT       │   │
│  └──────────────────────────┘   │
│  Both share the ./voices volume │
│  xtts/docker-compose.yml        │
└─────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│  ARIA-VM (Proxmox, Debian 13) — ARIA's home             │
│  Base system + Docker. ARIA sets up the rest herself.   │
@@ -49,39 +57,50 @@ ARIA has two roles:
│  ┌─────────────────────────────────────────────────┐    │
│  │ [proxy] claude-max-api-proxy container          │    │
│  │         Claude Max sub → local API              │    │
│  │         Port 3456, with sed patches for         │    │
│  │         tool permissions + host binding         │    │
│  │                                                 │    │
│  │ [qdrant] vector DB for ARIA's memory            │    │
│  │         Bind mount: aria-data/brain/qdrant/     │    │
│  │                                                 │    │
│  │ [brain] ARIA agent + memory container           │    │
│  │         FastAPI on port 8080                    │    │
│  │         Own agent loop, skills,                 │    │
│  │         vector memory, SSH access to the VM     │    │
│  │         Bind mount: aria-data/brain/data/       │    │
│  │                                                 │    │
│  │ [bridge] ARIA voice bridge container            │    │
│  │         Wake word, STT, TTS forwarding          │    │
│  │         Talks to the brain via HTTP/8080        │    │
│  │                                                 │    │
│  │ [diagnostic] self-check UI + settings           │    │
│  │         Port 3001 (on the bridge's network)     │    │
│  │         Chat, brain, files, logs                │    │
│  └──────────────────┬──────────────────────────────┘    │
│                     │ volume mount                      │
│                     ▼                                   │
│  ┌─────────────────────────────────────────────────┐    │
│  │ ./aria-data/        — configuration + SSH keys  │    │
│  │ ./aria-data/brain/  — vector DB + skills        │    │
│  │                       (gitignored)              │    │
│  │ Backup via Diagnostic → "Gehirn-Export" (tar.gz)│    │
│  └─────────────────────────────────────────────────┘    │
└─────────────────────────────────────────────────────────┘
```

> OpenClaw (formerly `aria-core`) has been torn down — ARIA now runs on its own agent framework in the
> `aria-brain` container: own tools, skills, and vector memory instead of sessions. The last OpenClaw state
> is archived as Git tag `v0.1.2.0`.
**Four separate deployments:**

| What | Where | How |
|-----|----|-----|
| RVS | data center | `cd rvs && docker compose up -d` |
| ARIA brain/bridge/diagnostic | Debian 13 VM | `./init.sh && ./aria-setup.sh && docker compose up -d` |
| Gamebox stack (F5-TTS + Whisper) | Gamebox (GPU) | `cd xtts && docker compose up -d` |
| Android app | Stefan's phone | install the APK (auto-update via RVS) |

> The Gamebox stack is optional: without it, STT falls back to local Whisper (CPU,
> slower) and TTS stays off (ARIA then answers as text only).

---

## Installation — step by step
@@ -101,12 +120,12 @@ apt install -y docker.io docker-compose-plugin git curl jq
```bash
git clone git@gitea.hackersoft.de:aria/aria.git ~/ARIA-AGENT
cd ~/ARIA-AGENT
cp .env.example .env
bash init.sh   # creates USER.md from the template (idempotent, does no harm)
```

Edit the `.env` file (see `.env.example` for details):
```bash
# Auth token: all ARIA services use it for internal auth
ARIA_AUTH_TOKEN=        # openssl rand -hex 32

# RVS connection: hostname + port of your rendezvous server
@@ -115,17 +134,18 @@ RVS_PORT=443
RVS_TLS=true
RVS_TLS_FALLBACK=true

# Pairing token: connects app, bridge, Diagnostic, and Gamebox in the same RVS room
# MUST be identical on all devices (ARIA-VM, gaming PC, app)
# Generated and filled in automatically by generate-token.sh
RVS_TOKEN=              # ./generate-token.sh

# Optional: SSH host of the RVS server for auto-update (e.g. root@aria-rvs)
RVS_UPDATE_HOST=
```
All other settings (voices, modes, wake word, F5-TTS tuning) live in
`/shared/config/runtime.json` and are maintained through the Diagnostic UI — not
in `.env`. A complete reset is possible at any time via "🗑 ALLES löschen" in the
Diagnostic settings tab.

**Two tokens, two purposes:**
- **ARIA_AUTH_TOKEN**: internal auth token between ARIA's containers.
- **RVS_TOKEN**: pairing token for the rendezvous server. All devices with the same token land in the same "room" and can talk to each other. The app receives this token via QR code.
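For illustration, a minimal sketch of how a device might join the shared RVS room. The handshake shape and the field names (`type`, `token`) are assumptions, not the actual RVS protocol:

```python
# Hypothetical sketch: joining an RVS room with the shared pairing token.
# Message fields are invented for illustration; only the idea is documented:
# every peer that presents the same RVS_TOKEN lands in the same room.
import asyncio, json, os
import websockets  # pip install websockets

async def join_room() -> None:
    token = os.environ["RVS_TOKEN"]            # identical on every device
    uri = "wss://rvs.example.net:443"          # RVS_HOST/RVS_PORT from .env
    async with websockets.connect(uri) as ws:
        await ws.send(json.dumps({"type": "join", "token": token}))
        print(await ws.recv())                 # e.g. an ack from the relay

asyncio.run(join_room())
```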
### 2. Log in to the Claude CLI (proxy auth)

@@ -143,52 +163,24 @@ claude login

**Important:** the folder `~/.claude/` (not `~/.config/claude/`!) is mounted into
the proxy as a volume. The credentials survive container restarts.

### 3. Generate the SSH key for aria-wohnung + RVS token + containers

```bash
# SSH key for ARIA's access to the VM (aria-wohnung)
./aria-setup.sh

# Generate the RVS token — writes RVS_TOKEN into .env, shows a QR code
./generate-token.sh

# Start all containers
docker compose up -d
```

`aria-setup.sh` generates the ed25519 key in `aria-data/ssh/` and adds the
public key to `/root/.ssh/authorized_keys` (the script runs as root on the VM
aria-wohnung). Brain and proxy use the same key.
### 4. Connect the app

Open the app → scan the QR code → "ARIA, hoerst du mich?"

@@ -196,20 +188,19 @@ The QR code contains host, port, token, and TLS flag — scan once, never again

Show the existing token as a QR code again: `./generate-token.sh show`

### 5. Check Diagnostic

```bash
# In the browser:
http://<VM-IP>:3001
```

The Diagnostic UI has four top-level tabs:

- **Main** — live chat test, status (brain / RVS / proxy), end-to-end trace
- **Gehirn** (brain) — memory management (vector DB), skills, export/import of the complete brain as tar.gz
- **Dateien** (files) — all files from `/shared/uploads/` (generated by ARIA or uploaded), with download/delete
- **Einstellungen** (settings) — repair (container restart), wipe, speech output, Whisper, runtime config, app onboarding (QR), full reset

---
@@ -217,7 +208,7 @@ The Diagnostic UI shows:

The proxy is the centerpiece: it turns the Claude Max subscription into a local API.

**Flow:** `aria-brain → HTTP → claude-max-api-proxy → Claude Code CLI (--print) → Anthropic API`

The proxy container (`node:22-alpine`) installs on every start:
- `@anthropic-ai/claude-code` — the Claude Code CLI
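Since the proxy presents the subscription as a local, OpenAI-style API (the archived OpenClaw config addressed it as an `openai-completions` provider at `http://proxy:3456/v1`), a direct request can be sketched like this. The exact route and payload shape are assumptions following the generic chat-completions convention:

```python
# Minimal sketch: calling claude-max-api-proxy over its OpenAI-style API.
# Route and payload follow the generic chat-completions shape (assumed).
import requests

resp = requests.post(
    "http://proxy:3456/v1/chat/completions",
    json={
        "model": "claude-sonnet-4",   # served through the Claude Max sub
        "messages": [{"role": "user", "content": "Wie wird das Wetter in Bremen?"}],
    },
    headers={"Authorization": "Bearer not-needed"},  # proxy ignores the key
    timeout=900,  # cold start: every request spawns a fresh `claude --print`
)
print(resp.json()["choices"][0]["message"]["content"])
```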
@@ -238,78 +229,67 @@ After that, four patches are applied via `sed`:

## Configuration files

### aria-data/

| Path | Purpose |
|------|---------|
| `.env` | tokens (ARIA_AUTH_TOKEN, RVS_TOKEN, RVS_HOST) — minimal, everything else lives in the DB |
| `aria-data/ssh/` | SSH key for access to aria-wohnung (brain + proxy share the key) |
| `aria-data/brain/qdrant/` | vector-DB storage (bind mount, gitignored) |
| `aria-data/brain/data/` | skills, embedding-model cache (bind mount, gitignored) |
| `aria-data/brain-import/` | `AGENT.md`, `USER.md.example`, `TOOLING.md.example` — source for the initial memory import into the vector DB |
| `aria-data/config/diag-state/` | Diagnostic state (e.g. the most recently active session) |

### /shared/config/ (in the aria-shared volume)

| File | Purpose |
|------|---------|
| `voice_config.json` | TTS engine, cloned voice, Whisper model, F5-TTS tuning |
| `runtime.json` | token + RVS override + Whisper language (maintained by Diagnostic) |
| `highlight_triggers.json` | highlight trigger words |
| `chat_backup.jsonl` | append-only chat log (source for the chat history in Diagnostic) |

`voice_config.json` + `highlight_triggers.json` can be exported/imported as a
bundle via Diagnostic → "Sprachausgabe".

### Backup / reset

- **Brain backup**: Diagnostic → Gehirn → "⬇ Export herunterladen" — the complete brain (memories + skills + Qdrant DB) as a `.tar.gz`
- **Voice backup**: one `.tar.gz` per voice (Diagnostic → Sprachausgabe → ⬇ per voice); import via the upload button
- **Full reset**: Diagnostic → Einstellungen → "🗑 ALLES löschen" — memory + voices + settings gone; `.env` + SSH keys remain

---
## Voice Bridge

The bridge connects the Android app to ARIA and orchestrates the GPU services
on the Gamebox.

**Message flow:**
```
Text:  App → RVS → Bridge → aria-brain (HTTP)
Audio: App → RVS → Bridge → stt_request (RVS) → whisper-bridge (Gamebox)
       → stt_response → Bridge → aria-brain
       Fallback on timeout: local faster-whisper (CPU)
File:  App → RVS → Bridge → /shared/uploads/ → aria-brain (with path)

aria-brain → reply → Bridge → RVS → App
                   → xtts_request (RVS) → f5tts-bridge
                   → audio_pcm stream → RVS → App AudioTrack
```
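The remote-first STT dispatch with its CPU fallback can be sketched as follows. The helper names are invented; only the 45 s timeout and the fallback order are documented:

```python
# Sketch of the bridge's STT dispatch: try the Gamebox whisper-bridge over
# RVS first, fall back to local CPU whisper after 45 s. The two helpers are
# hypothetical stand-ins for the bridge's real RVS and local-model code.
import asyncio

REMOTE_STT_TIMEOUT_S = 45  # documented timeout before the CPU fallback

async def transcribe(pcm_16k_mono: bytes) -> str:
    try:
        return await asyncio.wait_for(
            send_stt_request_via_rvs(pcm_16k_mono),   # hypothetical helper
            timeout=REMOTE_STT_TIMEOUT_S,
        )
    except asyncio.TimeoutError:
        # Gamebox offline or overloaded — degrade to the slower local model
        return transcribe_local_cpu(pcm_16k_mono)     # hypothetical helper
```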
### Features

- **STT primarily remote**: aria-bridge sends an `stt_request` to the Gamebox Whisper
  (faster-whisper CUDA, near real time). 45 s timeout, then fallback to local
  CPU Whisper. Model choice in Diagnostic, hot swap via config broadcast.
- **TTS via F5-TTS**: aria-f5tts-bridge on the Gamebox. Voice cloning from
  reference audio plus an automatically transcribed reference text.
- **Text cleanup**: a `<voice>...</voice>` tag is preferred; Markdown, code,
  units, and URLs are prepared for TTS. Decimals are spelled out
  (`0,1` → "null komma eins"). Acronyms of up to 5 letters are spelled letter by
  letter (`USB` → "U S B", `XTTS` → "X T T S") — see the sketch after this list.
- **Wake word**: openwakeword (local microphone on the VM, optional)
- **Modes**: normal, do-not-disturb, whisper, hangar, gaming
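The decimal and acronym rules can be illustrated with a small text filter — a minimal sketch of the documented behavior, not the bridge's actual implementation:

```python
# Sketch of the TTS cleanup rules above: German decimal commas are spoken
# digit by digit, short all-caps acronyms are spelled out. The regexes are
# illustrative, not taken from the bridge.
import re

GERMAN_DIGITS = ["null", "eins", "zwei", "drei", "vier",
                 "fuenf", "sechs", "sieben", "acht", "neun"]

def spell_digits(digits: str) -> str:
    return " ".join(GERMAN_DIGITS[int(d)] for d in digits)

def clean_for_tts(text: str) -> str:
    # 0,1 -> "null komma eins"
    text = re.sub(
        r"\b(\d),(\d+)\b",
        lambda m: f"{GERMAN_DIGITS[int(m.group(1))]} komma {spell_digits(m.group(2))}",
        text,
    )
    # USB -> "U S B": spell out all-caps acronyms of 2-5 letters
    return re.sub(r"\b([A-Z]{2,5})\b", lambda m: " ".join(m.group(1)), text)

print(clean_for_tts("0,1 Prozent via USB"))  # null komma eins Prozent via U S B
```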
### Operating modes

@@ -322,43 +302,27 @@ aria-core → reply → Gateway → Diagnostic → RVS → App

| Hangar | `"ARIA, ich arbeite"` | only important notifications |
| Gaming | `"ARIA, Gaming-Modus"` | answer direct questions only |

---

## Diagnostic — self-check UI and settings

Reachable at `http://<VM-IP>:3001`. Shares its network with the bridge.

### Tabs

- **Main**: brain/RVS/proxy status, chat test, "ARIA denkt..." indicator, end-to-end trace, container logs
- **Gehirn** (brain): memory browser (vector DB), search + filter, edit/add/delete, brain export/import (tar.gz), skills (planned)
- **Dateien** (files): browser for `/shared/uploads/` — download or delete files ARIA generated or that were uploaded (live update of the chat bubbles)
- **Einstellungen** (settings): repair (container restart for brain/bridge/Qdrant), full reset, operating modes, speech output + voice cloning + F5-TTS tuning, Whisper, onboarding QR, app cleanup

### What else is in there

- **Disk-full banner** with copyable cleanup commands (safe + aggressive)
- **Voice cloning**: upload audio samples; Whisper transcribes the reference text automatically
- **Voice export/import**: take individual voices along between Gameboxes as `.tar.gz`
- **Settings export/import**: `voice_config.json` + `highlight_triggers.json` as a JSON bundle
- **Claude login**: browser terminal for logging in to the proxy
- **Container logs**: real-time logs of all containers (filtered by tab + pipeline)
- **SSH terminal**: direct SSH access to aria-wohnung

---
@@ -367,22 +331,72 @@ API endpoint for other services: `GET http://localhost:3001/api/session`

### Features

- Text chat with ARIA
- **Voice recording**: tap-to-talk (tap to start, tap to stop, auto-stop on silence via VAD)
- **Conversation mode** (ear button): after every ARIA reply, recording starts automatically — back and forth like a natural conversation
- **Wake word** (on-device, openWakeWord ONNX): "Hey Jarvis", "Alexa", "Hey Mycroft", "Hey Rhasspy" — the microphone listens passively, a conversation starts on the keyword. Completely on-device via ONNX Runtime: no API key, no cloud round trip, audio never leaves the device.
- **VAD (voice activity detection)**: adaptive threshold (baseline from the first 500 ms of mic level + 6 dB offset). Configurable silence tolerance (1.0–8.0 s, default 2.8 s) before auto-stop kicks in. Max recording length adjustable (1–30 min, default 5 min) — see the VAD sketch after this list
- **Barge-in**: if you send a new voice/text message while ARIA is answering, she is interrupted and gets the hint "this is a correction"
- **Wake word during TTS**: you can say "Computer" while ARIA is still talking — an AcousticEchoCanceler prevents ARIA's own voice from triggering the wake word
- **Call pause + auto-resume**: TTS goes silent on a classic phone call or a VoIP call (WhatsApp/Signal/Discord). After hangup, ARIA continues from the **exact spot** where she was interrupted — the app measures the position from playback start and uses the WAV cache of the reply
- **Speech gate**: the recording is discarded if no speech is detected
- **STT (speech-to-text)**: 16 kHz mono → bridge → Gamebox Whisper (CUDA) → text in the chat. Near real time.
- **"ARIA denkt..." indicator**: shows the core's live status (thinking, tool, writing) + cancel button
- **TTS playback**: F5-TTS PCM streaming straight into AudioTrack with a configurable pre-roll buffer (1.0–6.0 s, default 3.5 s) against gaps during render pauses
- **Audio pause**: other apps (Spotify, YouTube etc.) pause completely while ARIA speaks and only come back after real playback end
- **Local voice choice**: each device can use its own voice (in settings). A Diagnostic switch overrides all app choices.
- **Voice-ready toast**: on a voice switch the app shows "Stimme X bereit (X.Ys)" as soon as the preload is done
- **Play button**: every ARIA message can be read out again (from cache if available, otherwise re-rendered)
- **Chat search**: the magnifier in the status bar filters messages live
- **Multiple attachments**: collect images + files, add text, then send together
- **Paste support**: insert images from the clipboard (Diagnostic)
- **Attachments**: the bridge stores them in the shared volume, ARIA can access them, re-download via RVS
- **Settings**: TTS on/off, F5-TTS voice, pre-roll buffer, silence tolerance, storage location, auto-download, GPS, verbose logging
- **Auto-update**: checks for a new version on start + via button; download + installation via RVS (FileProvider)
- GPS position (optional, with runtime permission request) — attached to every chat/audio payload and can be shown in Diagnostic as a debug block
- QR-code scanner for token pairing
- **Receiving ARIA's files**: if ARIA creates a PDF/image/Markdown/ZIP for you (marker `[FILE: /shared/uploads/aria_*]` in the reply), it appears as its own attachment bubble. Tap → loaded via RVS + opened with the Android intent picker (PDF viewer, image viewer, default app). Inline images from Markdown image syntax are rendered directly under the text (PNG/JPG via Image, SVG via react-native-svg)
- **Fullscreen with pinch zoom**: images in the fullscreen modal are pinch-zoomable (1x..5x), one-finger pan when zoomed, double tap toggles 1x↔2.5x — all without an external lib
- **Container restart buttons** (Settings → Reparatur): restart aria-bridge / aria-brain / aria-qdrant individually (~5 s downtime each). Goes via RVS → bridge → Diagnostic → Docker socket API.
- **Cache cleanup**: orphaned TTS WAVs are cleared from the cache on app start. Plus the settings buttons "TTS-Cache leeren", "Update-Cache leeren", "Anhang-Cache leeren"
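The adaptive VAD threshold mentioned above can be re-stated as a short sketch. The app's real code is TypeScript; this is an illustrative restatement of the documented logic (500 ms baseline, +6 dB offset, configurable silence tolerance), not a port:

```python
# Sketch of the adaptive VAD: baseline from the first 500 ms of mic level
# plus a 6 dB offset, auto-stop after the configured silence tolerance.
import math

SILENCE_TOLERANCE_S = 2.8   # default, user-configurable 1.0-8.0 s

def rms_dbfs(frame: list[float]) -> float:
    rms = math.sqrt(sum(s * s for s in frame) / len(frame)) or 1e-9
    return 20 * math.log10(rms)

def should_stop(frames: list[list[float]], frame_ms: int = 20) -> bool:
    baseline_n = int(500 / frame_ms)                  # first 500 ms of frames
    baseline = sum(rms_dbfs(f) for f in frames[:baseline_n]) / baseline_n
    threshold = baseline + 6.0                        # +6 dB over room noise
    silent_run = 0
    for f in frames[baseline_n:]:
        silent_run = silent_run + 1 if rms_dbfs(f) < threshold else 0
        if silent_run * frame_ms / 1000 >= SILENCE_TOLERANCE_S:
            return True                               # silence long enough
    return False
```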
### Wake word (openWakeWord, on-device)

Wake-word detection runs completely **on-device** via [openWakeWord](https://github.com/dscripka/openWakeWord)
with ONNX Runtime — no API key, no cloud round trip, not a cent in license fees,
and the audio never leaves the device.

**Bundled wake words** (ONNX files in `android/android/app/src/main/assets/openwakeword/`):
- `Hey Jarvis` (default, openWakeWord original)
- `Computer` (Star Trek style, community model)
- `Alexa`, `Hey Mycroft`, `Hey Rhasspy` (openWakeWord originals)

Community models come from [fwartner/home-assistant-wakewords-collection](https://github.com/fwartner/home-assistant-wakewords-collection).

**Usage:**
- App → **Einstellungen** → **Wake-Word** → choose the desired keyword → **Speichern + Aktivieren**
- Tap the **ear button (👂)** in the status bar → the wake word is armed, the app listens passively
- Say the wake word → the icon switches to 🎙️, a **ready sound** (ding-dong, optional in settings) + toast "🎤 sprich jetzt" as soon as the mic is really open
- After every ARIA reply the mic opens once more — silence → back to 👂
- Tap again → ear off (🔇)

**Training your own wake words** (free, ~30 min):

1. Open the openWakeWord training notebook on Colab (link in the
   [openWakeWord repo](https://github.com/dscripka/openWakeWord) under "Training Custom Models")
2. Enter the wake-word phrase (e.g. "ARIA", "Hey Stefan") and run the notebook —
   it generates synthetic training examples and trains the model.
3. Download the resulting `.onnx` file
4. Put the file into `android/android/app/src/main/assets/openwakeword/`
5. Add the file name (without `.onnx`) to the `WAKE_KEYWORDS` list in
   `android/src/services/wakeword.ts`
6. Rebuild the APK

*(A Diagnostic upload for custom `.onnx` files without a rebuild comes later.)*

**Tuning** (in [wakeword.ts](android/src/services/wakeword.ts)):
- `DEFAULT_THRESHOLD = 0.5` — score threshold (raise to 0.6–0.7 on false positives)
- `DEFAULT_PATIENCE = 2` — how many frames above the threshold are needed
- `DEFAULT_DEBOUNCE_MS = 1500` — minimum gap between two triggers
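How the three knobs interact can be shown in a few lines — a logic sketch only; the real implementation lives in `wakeword.ts` (TypeScript):

```python
# Logic sketch of the three wake-word tuning knobs above.
THRESHOLD = 0.5        # minimum model score
PATIENCE = 2           # frames in a row that must exceed the threshold
DEBOUNCE_MS = 1500     # minimum gap between two triggers

class Trigger:
    def __init__(self) -> None:
        self.run = 0
        self.last_ms = -DEBOUNCE_MS

    def feed(self, score: float, now_ms: int) -> bool:
        # count consecutive frames over the threshold
        self.run = self.run + 1 if score >= THRESHOLD else 0
        if self.run >= PATIENCE and now_ms - self.last_ms >= DEBOUNCE_MS:
            self.run = 0
            self.last_ms = now_ms
            return True      # fire the wake word
        return False
```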
### First-time setup (dev machine, one-off)

@@ -429,7 +443,7 @@ RVS_UPDATE_HOST=root@aria-rvs   # optional: for auto-update

### Docker cleanup

The bridge image pulls large ML dependencies (faster-whisper, ctranslate2, onnxruntime,
openwakeword) — the Docker build cache grows with every rebuild. When
the VM fills up:
@@ -451,18 +465,44 @@ The update flow:

```
App (microphone) → AAC/MP4 recording → Base64 → RVS → Bridge
Bridge: FFmpeg (16 kHz PCM) → Whisper STT → text → aria-brain
Bridge: STT result → RVS → App (placeholder replaced by the transcribed text)
aria-brain → reply → Bridge → F5-TTS (gaming PC) → PCM stream → RVS → App
App: AudioTrack MODE_STREAM (seamless), cached as WAV per message
```
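The pre-roll that keeps this stream gapless can be sketched as a simple buffering gate. Names and the PCM sample rate are assumptions; the 3.5 s default is from the app settings described earlier:

```python
# Sketch of the pre-roll idea: hold back playback until ~3.5 s of PCM has
# arrived so GPU render pauses on the Gamebox don't cause audible gaps.
# The real player is the app's native PcmStreamPlayer; names are illustrative.
SAMPLE_RATE = 24_000          # assumed PCM rate; not stated in the README
PRE_ROLL_S = 3.5              # default, configurable 1.0-6.0 s in settings

def stream_to_speaker(chunks, play) -> None:
    buffered = bytearray()
    started = False
    for chunk in chunks:                  # PCM chunks arriving via RVS
        buffered += chunk
        # 2 bytes per 16-bit mono sample
        if not started and len(buffered) >= int(PRE_ROLL_S * SAMPLE_RATE * 2):
            started = True
        if started:
            play(bytes(buffered))         # hand everything gathered so far on
            buffered.clear()
```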
### Audio behavior in the app

| Phase | Other app (Spotify) | ARIA mic |
|------------------------------|----------------------|-------------------------|
| Idle / ear off | plays freely | off |
| Wake word listening (armed) | plays freely | passive (openWakeWord) |
| User recording running | paused (EXCLUSIVE) | recording |
| Recording finished | resumed | off |
| ARIA thinking/writing (~20 s) | plays freely | off |
| TTS starts | paused (DUCK) | off (or barge) |
| TTS playing (incl. GPU pauses) | stays paused | barge on wake word |
| TTS finished | resumed after 800 ms | (conversation window) |
| Incoming call (incl. VoIP) | — | mic paused |
| Call over (auto-resume) | paused again | off |
| New question during call | — | (resume discarded) |

Mechanisms: underrun protection in the PcmStreamPlayer (silence fill during
render pauses), conversation focus on wake word, foreground service with
mediaPlayback|microphone, call detection via TelephonyManager plus an
audio-focus-loss listener with polling fallback (VoIP). On a call, the
playback position is remembered — after hangup ARIA resumes from the exact
spot (or discards that if the user asked a new question by text during the
call). A PcmPlaybackFinished event from the native side ensures Spotify stays
paused until ARIA has actually gone silent.
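The call-pause bookkeeping can be sketched like this — the app really does it in TypeScript with `Date.now()` tracking and the per-message WAV cache; the names here are illustrative:

```python
# Sketch of the call-pause bookkeeping: remember how far playback got
# (wall clock since playback start) and resume from that offset out of the
# per-message WAV cache after the call ends.
import time

class ResumeState:
    def __init__(self) -> None:
        self.play_started_at = 0.0
        self.paused_offset_s = None

    def on_playback_start(self) -> None:
        self.play_started_at = time.monotonic()

    def on_call_ringing(self) -> None:
        # elapsed wall clock since playback start ~ position in the WAV
        self.paused_offset_s = time.monotonic() - self.play_started_at

    def on_call_ended(self, wav_cache_path: str, play_from) -> None:
        # paused_offset_s is cleared elsewhere if the user asked a new
        # question during the call, so the newest reply wins
        if self.paused_offset_s is not None:
            play_from(wav_cache_path, self.paused_offset_s)
            self.paused_offset_s = None
```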
### File pipeline (images & attachments)

```
App (camera/file manager) → Base64 → RVS → Bridge
Bridge: stores it in /shared/uploads/ (shared volume, visible to aria-brain)
Bridge: aria-brain → "Stefan sent an image: foto.jpg — located at /shared/uploads/..."
ARIA: can open and analyze the file via the Bash/Read tool
```
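Sketched server-side, the upload step could look like this. The brain's chat route and the service hostname are assumptions; only the shared-volume path and the path-in-message convention are documented:

```python
# Sketch of the bridge's upload handling: decode the Base64 payload from the
# app, drop it into the shared volume, and hand aria-brain a plain-text note
# with the path. The /chat route and "brain" hostname are assumed.
import base64, pathlib, requests

UPLOADS = pathlib.Path("/shared/uploads")

def handle_upload(filename: str, b64_data: str) -> None:
    target = UPLOADS / filename
    target.write_bytes(base64.b64decode(b64_data))
    requests.post(
        "http://brain:8080/chat",   # assumed route on the brain's FastAPI
        json={"text": f"Stefan sent an image: {filename} — located at {target}"},
        timeout=900,
    )
```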
@@ -492,39 +532,34 @@ is configurable in the app settings.

## Data directory — aria-data/

Everything ARIA knows, can do, and is lives here. One `tar` = a complete backup.

```
aria-data/
├── brain/                       ← ARIA's brain — bind mount, GITIGNORED
│   ├── qdrant/                  ← vector-DB storage (memories, skill embeddings)
│   └── data/                    ← skills, embedding-model cache
│       └── skills/<name>/       ← one folder per skill: manifest, code, venv
│
├── brain-import/                ← source files for the initial import into the DB
│   ├── AGENT.md                 ← personality (becomes memory points of type identity/rule)
│   ├── BOOTSTRAP.md
│   ├── TOOLING.md.example
│   └── USER.md.example
│
├── config/
│   └── diag-state/              ← persistent Diagnostic state
│
│   (in the shared volume /shared/config/):
│   ├── voice_config.json        ← TTS settings (voice, speed, engine)
│   ├── highlight_triggers.json  ← highlight trigger words
│   └── chat_backup.jsonl        ← message backup (on the fly)
│
└── ssh/                         ← SSH keys (shared by brain + proxy)
    ├── id_ed25519
    ├── id_ed25519.pub
    └── config                   ← host aria-wohnung
```

`aria-data/brain/` (vector DB + skills) is gitignored — backups run through the
brain-export button in Diagnostic, not through Git.

Settings in the shared volume (`/shared/config/`): `voice_config.json`,
`highlight_triggers.json`, `runtime.json`, `chat_backup.jsonl`.

**Backup:**
```bash
tar -czf aria-backup-$(date +%Y%m%d).tar.gz aria-data/
```
@@ -558,7 +593,7 @@ cp ARIA-v0.0.3.0.apk ~/ARIA-AGENT/rvs/updates/

---

## Gamebox stack — F5-TTS + Whisper (GPU services)

Runs on a separate machine with an NVIDIA GPU (e.g. a gaming PC with an RTX 3060).
Connects to the ARIA infrastructure via RVS — no VPN needed, works
@@ -567,22 +602,27 @@ across different networks.

### Architecture

```
Gamebox (Windows, RTX 3060, Docker Desktop + WSL2)
├── aria-f5tts-bridge     F5-TTS voice cloning + RVS relay
│                         listens for xtts_request, streams audio_pcm
├── aria-whisper-bridge   faster-whisper on CUDA (float16)
│                         listens for stt_request, replies with stt_response
└── ./voices/             shared between the two:
                          {name}.wav — reference audio (~6-10 s)
                          {name}.txt — reference text (auto via Whisper)

        ↕ RVS (data center, WebSocket relay)

ARIA-VM
└── aria-bridge: STT primarily remote (45 s timeout, then local CPU fallback)
                 TTS via xtts_request → audio_pcm stream
```
### Prerequisites

- Docker Desktop with WSL2 (Windows) or Docker with the NVIDIA runtime (Linux)
- NVIDIA Container Toolkit
- GPU with at least 6 GB VRAM (Whisper-large + F5-TTS together)
- **The same RVS_TOKEN as on the ARIA-VM!**

### Setup
@@ -592,52 +632,89 @@ cd xtts
```bash
cp .env.example .env
# Fill .env with the RVS connection data (same token as on the ARIA-VM!)
docker compose up -d
# The first start downloads the models (Whisper ~1-3 GB depending on size, F5-TTS ~1 GB)
```

The models are cached in the `f5tts-models` and `whisper-models` volumes
and only need to be loaded once.

### Features

**F5-TTS (speech output):**
- High-quality voice cloning from 6-10 s of reference audio
- Render time ~0.3x real time on an RTX 3060 (RTF ≈ 0.3)
- Sentence-wise streaming, fade-in on the first chunk against warm-up glitches
- Sequential queue against GPU OOM on parallel requests — see the sketch below
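That sequential queue can be sketched with a single GPU worker draining an asyncio queue — `render_f5tts` is a hypothetical stand-in for the actual renderer:

```python
# Sketch of the sequential render queue: one worker owns the GPU, so two
# parallel TTS requests can never allocate VRAM at the same time.
import asyncio

queue: asyncio.Queue = asyncio.Queue()

async def gpu_worker() -> None:
    while True:
        text, reply = await queue.get()
        try:
            reply.set_result(await render_f5tts(text))  # hypothetical renderer
        finally:
            queue.task_done()

async def synthesize(text: str) -> bytes:
    reply: asyncio.Future = asyncio.get_running_loop().create_future()
    await queue.put((text, reply))        # callers line up instead of racing
    return await reply
```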
**Whisper (speech recognition):**
- faster-whisper with CUDA + float16 — near-real-time transcription
- Models: tiny / base / small / medium / large-v3 (hot swap via Diagnostic)
- Also used internally by the f5tts-bridge to auto-generate the reference text
  on voice upload
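The CUDA/float16 combination maps directly onto the faster-whisper API; a minimal usage sketch, with model size, file path, and language as example values:

```python
# Minimal faster-whisper usage matching the setup above (CUDA + float16).
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")
segments, info = model.transcribe("/voices/sample.wav", language="de")
print(" ".join(segment.text for segment in segments))
```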
### TTS config

In Diagnostic under Einstellungen → Sprachausgabe:
- **TTS aktiv**: global on/off
- **F5-TTS voice**: default or cloned (Maia etc.)

> F5-TTS is the only engine — if the Gamebox is offline, ARIA stays silent.
> Chat replies keep arriving (just no audio).

### Cloning a voice

1. App or Diagnostic → "Stimme klonen" → upload audio files
   (WAV/MP3, 1-10 files, ~6-10 s total)
2. Assign a name → "Stimme erstellen"
3. The f5tts-bridge stores the WAV, sends an `stt_request` to the
   whisper-bridge, puts the transcription next to it as a `.txt`, and reports
   back `xtts_voice_saved`. The toast in the app shows "Stimme bereit".
   (A sketch of this handler follows below.)
4. Select the voice → a voice preload (silent mini-render) warms up the
   latents; a "voice_ready" toast confirms it.

> **Tip:** for best results: a clean recording, one speaker, no background noise,
> 10-30 seconds total length. Several short files are concatenated.
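Step 3 can be sketched as a small handler on the f5tts-bridge. The RVS helpers are hypothetical; the file layout (`{name}.wav` + `{name}.txt` in the shared voices folder) and the `xtts_voice_saved` message are from the docs above:

```python
# Sketch of the f5tts-bridge clone handler: store the reference WAV, have the
# whisper-bridge transcribe it, persist the text next to it, announce the
# voice. rvs_stt_request/rvs_send are hypothetical stand-ins.
import pathlib

VOICES = pathlib.Path("/voices")   # assumed container mount of ./voices/

async def save_voice(name: str, wav_bytes: bytes) -> None:
    wav = VOICES / f"{name}.wav"
    wav.write_bytes(wav_bytes)                       # {name}.wav — reference audio
    text = await rvs_stt_request(wav_bytes)          # hypothetical RVS round trip
    (VOICES / f"{name}.txt").write_text(text)        # {name}.txt — reference text
    await rvs_send("xtts_voice_saved", {"voice": name})  # app shows the toast
```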
### German fine-tune (better quality in German)

The default model `F5TTS_v1_Base` is trained primarily on English + Chinese
and delivers noticeably weaker voice-cloning quality in German than XTTS
did. The community fine-tune by [aihpi](https://huggingface.co/aihpi/F5-TTS-German)
on the Emilia dataset + Common Voice 19.0 works considerably better.

**Configuration via Diagnostic → "F5-TTS Modell-Tuning (advanced)":**

| Field | Value |
|------|------|
| Model architecture | `F5TTS_Base` *(not v1_Base! The fine-tune is based on the old architecture)* |
| Custom checkpoint | `hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors` |
| Custom vocab | `hf://aihpi/F5-TTS-German/vocab.txt` |
| cfg_strength | `2.0` |
| nfe_step | `32` |

→ Click "Anwenden". The `hf://` paths are downloaded automatically once
(~3-5 GB, lands in `xtts/hf-cache/`) and reused from the cache on container
restart.

> **Warning about the BigVGAN variant** (`F5TTS_Base_bigvgan/model_295000.safetensors`):
> it does NOT currently work with this bridge. The f5-tts library loads the
> Vocos vocoder by default, and the BigVGAN weights are incompatible with it
> → the model produces NaN, the app stays silent. Use only the **Vocos variant
> (F5TTS_Base/model_365000.safetensors)**.

---
## Docker volumes

| Volume / bind | Path in the container | Purpose |
|---------------|-------------------|-------|
| `~/.claude` (bind) | `/root/.claude` (proxy) | Claude CLI credentials |
| `./aria-data/ssh` (bind) | `/root/.ssh` (proxy, brain) | SSH keys for aria-wohnung |
| `./aria-data/brain/qdrant` (bind) | `/qdrant/storage` (qdrant) | vector-DB storage |
| `./aria-data/brain/data` (bind) | `/data` (brain) | skills + embedding-model cache |
| `./aria-data/brain` (bind) | `/brain` (diagnostic) | brain export/import endpoints |
| `aria-shared` | `/shared` (brain, bridge, proxy, diagnostic) | file exchange, config, uploads |
| `./aria-data/config/diag-state` (bind) | `/data` (diagnostic) | persistent Diagnostic state |

---
@@ -666,22 +743,21 @@ docker compose down
```bash
# Rebuild a single container
docker compose up -d --build diagnostic
docker compose up -d --build bridge brain

# Logs
docker compose logs -f          # all
docker compose logs -f brain    # agent + memory only
docker compose logs -f qdrant   # vector DB only
docker compose logs -f bridge   # voice bridge only
docker compose logs -f proxy    # Claude proxy only

# Repeat the setup (after config changes)
./aria-setup.sh

# SSH test (brain to aria-wohnung)
docker exec aria-brain ssh aria-wohnung hostname

# Test the brain API directly
docker exec aria-brain curl localhost:8080/health
docker exec aria-brain curl localhost:8080/memory/stats
```

---
@@ -691,8 +767,10 @@ docker exec aria-core ssh aria-wohnung hostname

- **Proxy cold start**: every message spawns a new `claude --print` process.
  That makes ARIA slower than the direct Claude CLI. The timeout is 900 s (15 min).
- **No streaming to the app**: the app only shows the finished reply, no streaming tokens.
- **App wake words are built-ins only**: `Hey Jarvis`, `Alexa`, `Hey Mycroft`,
  `Hey Rhasspy` work out of the box; custom wake words currently still have to be
  placed into the app bundle as an `.onnx` file and added to the list in `wakeword.ts`.
  The Diagnostic upload UI is phase 2.
- **Audio format**: the app records AAC/MP4; the bridge converts it to 16 kHz PCM via FFmpeg.
- **RVS zombie connections**: WebSocket connections occasionally die without an error.
  The bridge has a ping check (5 s); Diagnostic uses a fresh connection per request.
@@ -718,7 +796,9 @@ docker exec aria-core ssh aria-wohnung hostname

- [x] SSH access to the VM (aria-wohnung)
- [x] Diagnostic web UI + settings
- [x] Session management + chat history
- [x] Voice settings (formerly Piper Ramona/Thorsten, highlight triggers) — replaced by XTTS, then F5-TTS voice cloning
- [x] Piper removed completely — only XTTS v2 as TTS (gaming PC)
- [x] Streaming TTS: PCM chunks straight into AudioTrack, seamless playback
- [x] Sentence-wise TTS for long texts
- [x] File/image upload with the shared volume
- [x] Watchdog (stuck-run detection + auto-fix + container restart)

@@ -736,6 +816,42 @@ docker exec aria-core ssh aria-wohnung hostname

- [x] "ARIA denkt..." indicator + cancel button in the app (via bridge → RVS)
- [x] Whisper model selectable in Diagnostic (tiny…large-v3, hot reload)
- [x] App recording explicitly 16 kHz mono (optimal for Whisper, no resample)
- [x] Streaming-TTS pre-roll buffer + waiting on playbackHeadPosition (no more mid-sentence cutoff)
- [x] Pre-roll buffer adjustable in the app settings
- [x] Decimal-to-words for TTS + generic acronym spelling
- [x] voice_preload/voice_ready: visual status indicator on voice switch
- [x] Whisper STT offloaded to the Gamebox (CUDA float16, near real time)
- [x] **F5-TTS replaces XTTS** — better voice-cloning quality, Whisper-auto-transcribed reference text
- [x] Audio pause instead of ducking (TRANSIENT instead of MAY_DUCK) + release-timing fix
- [x] VAD silence tolerance adjustable (1-8 s) + adaptive mic baseline + max recording adjustable (1-30 min)
- [x] Barge-in: the user can interrupt ARIA mid-reply; the core gets a context hint
- [x] Call pause + auto-resume: TTS goes silent on a call and continues from the remembered position after hangup (Date.now() tracking + WAV cache of the reply)
- [x] PcmPlaybackFinished event: audio focus is only released once the AudioTrack is really done — no more Spotify mid-TTS
- [x] Edge case: a new question during a call discards the pending auto-resume; the newest reply wins
- [x] Settings sub-screens: 8 categories instead of one long list
- [x] APK ABI split arm64-v8a: 35 MB instead of 136 MB
- [x] Voice-message bubble: audioRequestId instead of substring match — no more swapped bubbles with parallel recordings
- [x] Ready sound (airplane ding-dong) when the mic opens after the wake word — acoustic confirmation, can be disabled in settings
- [x] Wake word in parallel with TTS via AcousticEchoCanceler — saying "Computer" while ARIA talks stops her and opens the mic
- [x] Send the GPS position with messages (toggle in settings) — ARIA only uses it for location-related questions; in the chat it is visible only in her reply
- [x] Voice messages without an STT result are removed automatically after a timeout (scales with recording length)
- [x] Background audio service: TTS, wake-word listening + recording keep running with the app minimized (foreground service with mediaPlayback|microphone, dynamic notification)
- [x] Disk-full banner in Diagnostic with copyable cleanup commands
- [x] Wake word on-device via openWakeWord (ONNX Runtime, no API key) + state icon

### Phase A — refactor: OpenClaw out, own brain in

- [x] aria-brain container skeleton (FastAPI, Qdrant, sentence-transformers)
- [x] Diagnostic: brain tab (memory search/filter, add/edit/delete)
- [x] Diagnostic: brain export/import as tar.gz
- [x] Diagnostic: file manager (list, search, download, delete with live bubble update)
- [x] App: file manager as a modal in the settings
- [x] Diagnostic: full reset (wipe all)
- [x] Voice export/import (individual voices + F5/Whisper settings as a bundle)
- [x] aria-core (OpenClaw) torn down completely — tag `v0.1.2.0` as the archive
- [ ] **Phase B item 2:** migration `aria-data/brain-import/` → atomic memory points
- [ ] **Phase B item 3:** brain conversation loop (single chat + rolling window + memory distillate)
- [ ] **Phase B item 4:** skills system (manifest, venv, README per skill, Diagnostic tab)

### Phase 2 — ARIA gets productive

@@ -744,12 +860,11 @@ docker exec aria-core ssh aria-wohnung hostname

- [ ] Set up the VM (desktop, browser, tools)
- [ ] Heartbeat (periodic self-checks)
- [ ] Local LLM as gatekeeper (triage before the Claude call)
- [ ] Auto-compacting / memory management

### Phase 3 — extensions

- [ ] STARFACE telephony skill
- [ ] Desktop client (Tauri)
- [ ] bKVM remote IT support
- [ ] Custom `.onnx` upload for wake words via Diagnostic (no app rebuild)
- [ ] Claude Vision directly (image analysis without the file-path detour)
@@ -13,6 +13,7 @@ import { createBottomTabNavigator } from '@react-navigation/bottom-tabs';
import ChatScreen from './src/screens/ChatScreen';
import SettingsScreen from './src/screens/SettingsScreen';
import rvs from './src/services/rvs';
import { initLogger } from './src/services/logger';

// --- Navigation ---

@@ -44,6 +45,10 @@ const TAB_ICONS: Record<string, { active: string; inactive: string }> = {
const App: React.FC = () => {
  // On startup: load the saved RVS configuration and connect
  useEffect(() => {
    // Load the verbose-logging setting BEFORE other modules get going.
    // initLogger is async but blocks nothing — while it is still running
    // we log normally (default on); afterwards console.log respects the setting.
    initLogger().catch(() => {});
    const initConnection = async () => {
      const config = await rvs.loadConfig();
      if (config) {
@@ -79,8 +79,8 @@ android {
        applicationId "com.ariacockpit"
        minSdkVersion rootProject.ext.minSdkVersion
        targetSdkVersion rootProject.ext.targetSdkVersion
        versionCode 309
        versionName "0.0.3.9"
        versionCode 10202
        versionName "0.1.2.2"
        // Fallback for libraries with product flavors
        missingDimensionStrategy 'react-native-camera', 'general'
    }
@@ -104,6 +104,19 @@ android {
            proguardFiles getDefaultProguardFile("proguard-android.txt"), "proguard-rules.pro"
        }
    }

    // ABI split: arm64-v8a only (every Android phone since ~2017). Takes the
    // APK from ~136 MB down to ~35 MB — relevant because ONNX Runtime and the
    // other native libs would otherwise be added once per architecture.
    // Anyone who needs 32-bit or an emulator can add "armeabi-v7a",
    // "x86_64" etc. here.
    splits {
        abi {
            enable true
            reset()
            include "arm64-v8a"
            universalApk false
        }
    }
}

dependencies {
@@ -111,6 +124,9 @@ dependencies {
    implementation("com.facebook.react:react-android")
    implementation("com.facebook.react:flipper-integration")

    // ONNX Runtime for the on-device wake word (openWakeWord ONNX models in assets/openwakeword/)
    implementation("com.microsoft.onnxruntime:onnxruntime-android:1.17.1")

    if (hermesEnabled.toBoolean()) {
        implementation("com.facebook.react:hermes-android")
    } else {
@@ -4,6 +4,19 @@
    <uses-permission android:name="android.permission.CAMERA" />
    <uses-permission android:name="android.permission.RECORD_AUDIO" />
    <uses-permission android:name="android.permission.REQUEST_INSTALL_PACKAGES" />
    <!-- Read the call state so TTS pauses while the phone is ringing -->
    <uses-permission android:name="android.permission.READ_PHONE_STATE" />
    <!-- Optional: attach the GPS position to a question (only if the user enables it in Settings) -->
    <uses-permission android:name="android.permission.ACCESS_COARSE_LOCATION" />
    <uses-permission android:name="android.permission.ACCESS_FINE_LOCATION" />
    <!-- Foreground service so TTS keeps running while the app is minimized.
         FOREGROUND_SERVICE_MICROPHONE is mandatory from Android 14 on if the
         service accesses the microphone while in the background (wake word,
         recording in conversation mode). -->
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_MICROPHONE" />
    <uses-permission android:name="android.permission.POST_NOTIFICATIONS" />

    <application
        android:name=".MainApplication"
@@ -35,5 +48,10 @@
                android:name="android.support.FILE_PROVIDER_PATHS"
                android:resource="@xml/file_paths" />
        </provider>

        <service
            android:name=".AriaPlaybackService"
            android:exported="false"
            android:foregroundServiceType="mediaPlayback|microphone" />
    </application>
</manifest>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -7,7 +7,7 @@ import com.facebook.react.uimanager.ViewManager

class ApkInstallerPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(ApkInstallerModule(reactContext))
        return listOf(ApkInstallerModule(reactContext), FileOpenerModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
@@ -0,0 +1,108 @@
package com.ariacockpit

import android.app.Notification
import android.app.NotificationChannel
import android.app.NotificationManager
import android.app.PendingIntent
import android.app.Service
import android.content.Intent
import android.os.Build
import android.os.IBinder
import android.util.Log
import androidx.core.app.NotificationCompat

/**
 * Foreground service that keeps the app process alive during TTS playback —
 * otherwise Android kills the process as soon as the app goes to the
 * background, and ARIA falls silent mid-sentence.
 *
 * The notification is persistent (ongoing) while the service is running.
 * Tapping the notification brings MainActivity back to the front.
 *
 * foregroundServiceType="mediaPlayback" is mandatory from Android 14 on;
 * otherwise startForeground() throws a SecurityException.
 */
class AriaPlaybackService : Service() {
    companion object {
        private const val TAG = "AriaPlaybackService"
        private const val CHANNEL_ID = "aria_playback"
        private const val NOTIFICATION_ID = 1042
        const val EXTRA_REASON = "reason" // "tts" | "wake" | "rec" | ""
    }

    private var currentReason: String = ""

    override fun onCreate() {
        super.onCreate()
        ensureNotificationChannel()
    }

    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
        val reason = intent?.getStringExtra(EXTRA_REASON) ?: ""
        currentReason = reason
        Log.i(TAG, "Foreground-Service start/update (reason=$reason)")
        try {
            startForeground(NOTIFICATION_ID, buildNotification(reason))
        } catch (e: Exception) {
            Log.e(TAG, "startForeground fehlgeschlagen", e)
            stopSelf()
        }
        // START_NOT_STICKY: if Android kills the service, do NOT restart it
        // automatically — the app decides when the service is needed.
        return START_NOT_STICKY
    }

    override fun onDestroy() {
        Log.i(TAG, "Foreground-Service gestoppt")
        super.onDestroy()
    }

    override fun onBind(intent: Intent?): IBinder? = null

    private fun ensureNotificationChannel() {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
            val nm = getSystemService(NotificationManager::class.java) ?: return
            if (nm.getNotificationChannel(CHANNEL_ID) == null) {
                val channel = NotificationChannel(
                    CHANNEL_ID,
                    "ARIA Audio-Wiedergabe",
                    NotificationManager.IMPORTANCE_LOW,
                ).apply {
                    description = "Notification waehrend ARIA spricht (haelt die App im Hintergrund am Leben)"
                    setShowBadge(false)
                }
                nm.createNotificationChannel(channel)
            }
        }
    }

    private fun buildNotification(reason: String): Notification {
        val launchIntent = Intent(this, MainActivity::class.java).apply {
            flags = Intent.FLAG_ACTIVITY_NEW_TASK or Intent.FLAG_ACTIVITY_CLEAR_TOP
        }
        val pendingFlags = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M)
            PendingIntent.FLAG_IMMUTABLE or PendingIntent.FLAG_UPDATE_CURRENT
        else
            PendingIntent.FLAG_UPDATE_CURRENT
        val pendingIntent = PendingIntent.getActivity(this, 0, launchIntent, pendingFlags)

        val (title, body) = when (reason) {
            "tts" -> "ARIA spricht" to "Antwort wird abgespielt — antippen oeffnet die App"
            "rec" -> "ARIA hoert zu" to "Sprachaufnahme laeuft — antippen oeffnet die App"
            "wake" -> "ARIA bereit" to "Wake-Word lauscht passiv — antippen oeffnet die App"
            else -> "ARIA aktiv" to "Hintergrund-Modus — antippen oeffnet die App"
        }

        return NotificationCompat.Builder(this, CHANNEL_ID)
            .setContentTitle(title)
            .setContentText(body)
            .setSmallIcon(R.mipmap.ic_launcher)
            .setContentIntent(pendingIntent)
            .setOngoing(true)
            .setShowWhen(false)
            .setPriority(NotificationCompat.PRIORITY_LOW)
            .setCategory(NotificationCompat.CATEGORY_SERVICE)
            .setVisibility(NotificationCompat.VISIBILITY_PUBLIC)
            .build()
    }
}
@@ -0,0 +1,212 @@
package com.ariacockpit

import android.content.Context
import android.media.AudioAttributes
import android.media.AudioFocusRequest
import android.media.AudioManager
import android.os.Build
import android.util.Log
import com.facebook.react.bridge.Arguments
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.modules.core.DeviceEventManagerModule

/**
 * Controls audio focus for ducking/muting other apps + emits loss events to
 * JS so ARIA stops speaking during VoIP calls (WhatsApp/Signal/Discord/...) —
 * those calls do not go through the TelephonyManager; instead they request
 * AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE, which is what we pick up here.
 *
 * - requestDuck()      → other apps get quieter (ARIA speaks TTS)
 * - requestExclusive() → other apps are paused (microphone recording)
 * - release()          → give up the focus, other apps may play again
 *
 * Events:
 *  - "AudioFocusChanged" with type:
 *      "loss"           — permanent loss (a call, another app permanently)
 *      "loss_transient" — temporary loss (short interruption)
 *      "gain"           — focus regained
 */
class AudioFocusModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    override fun getName() = "AudioFocus"

    companion object { private const val TAG = "AudioFocus" }

    private var currentRequest: AudioFocusRequest? = null

    private fun audioManager(): AudioManager? =
        reactApplicationContext.getSystemService(Context.AUDIO_SERVICE) as? AudioManager

    private fun emitFocusChange(type: String) {
        try {
            val params = Arguments.createMap().apply { putString("type", type) }
            reactApplicationContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
                .emit("AudioFocusChanged", params)
        } catch (e: Exception) {
            Log.w(TAG, "emit failed: ${e.message}")
        }
    }

    private val focusListener = AudioManager.OnAudioFocusChangeListener { focusChange ->
        when (focusChange) {
            AudioManager.AUDIOFOCUS_LOSS -> {
                Log.i(TAG, "AUDIOFOCUS_LOSS (z.B. Anruf, anderer Player permanent)")
                emitFocusChange("loss")
            }
            AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> {
                Log.i(TAG, "AUDIOFOCUS_LOSS_TRANSIENT (kurze Unterbrechung)")
                emitFocusChange("loss_transient")
            }
            AudioManager.AUDIOFOCUS_LOSS_TRANSIENT_CAN_DUCK -> {
                // A notification sound or similar — we ignore it, ARIA keeps going
                Log.d(TAG, "AUDIOFOCUS_LOSS_CAN_DUCK ignoriert")
            }
            AudioManager.AUDIOFOCUS_GAIN -> {
                Log.i(TAG, "AUDIOFOCUS_GAIN")
                emitFocusChange("gain")
            }
        }
    }

    private fun requestFocus(durationHint: Int, usage: Int, promise: Promise) {
        val am = audioManager()
        if (am == null) {
            promise.reject("NO_AUDIO_MANAGER", "AudioManager nicht verfuegbar")
            return
        }

        release()

        val result: Int = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
            val attrs = AudioAttributes.Builder()
                .setUsage(usage)
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
            val req = AudioFocusRequest.Builder(durationHint)
                .setAudioAttributes(attrs)
                .setOnAudioFocusChangeListener(focusListener)
                .build()
            currentRequest = req
            am.requestAudioFocus(req)
        } else {
            @Suppress("DEPRECATION")
            am.requestAudioFocus(focusListener, AudioManager.STREAM_MUSIC, durationHint)
        }

        promise.resolve(result == AudioManager.AUDIOFOCUS_REQUEST_GRANTED)
    }

    /** Other apps are paused (TTS is speaking).
     *
     * TRANSIENT (instead of TRANSIENT_MAY_DUCK): Spotify/YouTube pause
     * completely instead of merely getting quieter. This also avoids the
     * "comes back up" problem with MAY_DUCK, where the system lifted the
     * duck effect again after a short while even though we still held the
     * focus.
     */
    @ReactMethod
    fun requestDuck(promise: Promise) {
        requestFocus(
            AudioManager.AUDIOFOCUS_GAIN_TRANSIENT,
            AudioAttributes.USAGE_ASSISTANT,
            promise,
        )
    }

    /** Other apps are paused (microphone recording / conversation). */
    @ReactMethod
    fun requestExclusive(promise: Promise) {
        requestFocus(
            AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE,
            AudioAttributes.USAGE_VOICE_COMMUNICATION,
            promise,
        )
    }

    /** Give up the focus — other apps may return to full volume. */
    @ReactMethod
    fun release(promise: Promise) {
        release()
        promise.resolve(true)
    }

    /** Shake up the system's USAGE_MEDIA focus stack so Spotify/YouTube
     * resume when another player (e.g. react-native-sound) did not release
     * its focus properly. Strategy: briefly claim USAGE_MEDIA GAIN ourselves
     * — the system thereby invalidates the other player's stale stack entry —
     * and immediately abandon it again. Spotify receives the focus gain and
     * resumes.
     *
     * Workaround for the react-native-sound bug where Sound.stop()/release()
     * leaves the AudioFocusRequest hanging.
     */
    @ReactMethod
    fun kickReleaseMedia(promise: Promise) {
        val am = audioManager()
        if (am == null) {
            promise.resolve(false)
            return
        }
        // Run asynchronously — we want a request, a pause, then an abandon.
        // Without the pause the system (and thus Spotify) often does not
        // notice the brief owner change at all. 250 ms is enough, in
        // practice, for the focus-stack refresh.
        Thread {
            try {
                if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
                    val attrs = AudioAttributes.Builder()
                        .setUsage(AudioAttributes.USAGE_MEDIA)
                        .setContentType(AudioAttributes.CONTENT_TYPE_MUSIC)
                        .build()
                    val kickListener = AudioManager.OnAudioFocusChangeListener { /* ignore */ }
                    val kickReq = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN)
                        .setAudioAttributes(attrs)
                        .setOnAudioFocusChangeListener(kickListener)
                        .build()
                    am.requestAudioFocus(kickReq)
                    Thread.sleep(250)
                    am.abandonAudioFocusRequest(kickReq)
                } else {
                    val kickListener = AudioManager.OnAudioFocusChangeListener { /* ignore */ }
                    @Suppress("DEPRECATION")
                    am.requestAudioFocus(kickListener, AudioManager.STREAM_MUSIC, AudioManager.AUDIOFOCUS_GAIN)
                    Thread.sleep(250)
                    @Suppress("DEPRECATION")
                    am.abandonAudioFocus(kickListener)
                }
                Log.i(TAG, "kickReleaseMedia: USAGE_MEDIA-Stack aufgemischt (250ms Pause)")
            } catch (e: Exception) {
                Log.w(TAG, "kickReleaseMedia failed: ${e.message}")
            }
        }.start()
        promise.resolve(true)
    }

    private fun release() {
        val am = audioManager() ?: return
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
            currentRequest?.let { am.abandonAudioFocusRequest(it) }
        } else {
            @Suppress("DEPRECATION")
            am.abandonAudioFocus(focusListener)
        }
        currentRequest = null
    }

    /** Current audio mode: NORMAL=0, IN_CALL=2, IN_COMMUNICATION=3, CALL_SCREENING=4.
     * IN_COMMUNICATION is the typical VoIP-call mode (WhatsApp, Signal, etc.) —
     * it can be polled to detect when the call is over (back to NORMAL). */
    @ReactMethod
    fun getMode(promise: Promise) {
        val am = audioManager()
        if (am == null) {
            promise.resolve(0)
            return
        }
        promise.resolve(am.mode)
    }

    @ReactMethod fun addListener(eventName: String) {}
    @ReactMethod fun removeListeners(count: Int) {}
}
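For orientation, this is roughly how the JS side can drive this module: a minimal sketch, not code from this repo. The TTS helpers (`playTts`, `stopTts`, `resumeTts`) are hypothetical placeholders for whatever the app actually uses.

```ts
import { NativeModules, NativeEventEmitter } from 'react-native';

const { AudioFocus } = NativeModules;
const focusEvents = new NativeEventEmitter(AudioFocus);

// Hypothetical TTS hooks; stand-ins for the app's real playback functions.
declare function playTts(text: string): Promise<void>;
declare function stopTts(): void;
declare function resumeTts(): void;

// Stop speaking when a VoIP call steals the focus, resume when it comes back.
focusEvents.addListener('AudioFocusChanged', ({ type }: { type: string }) => {
  if (type === 'loss' || type === 'loss_transient') stopTts();
  if (type === 'gain') resumeTts();
});

async function speak(text: string): Promise<void> {
  const granted: boolean = await AudioFocus.requestDuck(); // pauses Spotify/YouTube
  if (!granted) console.warn('audio focus denied');
  try {
    await playTts(text);
  } finally {
    await AudioFocus.release();          // hand the focus back
    await AudioFocus.kickReleaseMedia(); // nudge a stuck media player to resume
  }
}
```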
@@ -0,0 +1,16 @@
package com.ariacockpit

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class AudioFocusPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(AudioFocusModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
        return emptyList()
    }
}
@@ -0,0 +1,59 @@
package com.ariacockpit

import android.content.Intent
import android.os.Build
import android.util.Log
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod

/**
 * RN bridge for the AriaPlaybackService.
 *
 * Started from JS during a TTS playback so Android does not kill the app
 * process while the app is in the background (= ARIA keeps talking even
 * when Stefan has minimized the app).
 *
 * The service stops either explicitly via stop() or gets killed together
 * with the process by Android (which, for a foreground service, only
 * happens when the user force-stops the app).
 */
class BackgroundAudioModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    override fun getName() = "BackgroundAudio"

    companion object { private const val TAG = "BackgroundAudio" }

    @ReactMethod
    fun start(reason: String, promise: Promise) {
        try {
            val ctx = reactApplicationContext
            val intent = Intent(ctx, AriaPlaybackService::class.java)
            intent.putExtra(AriaPlaybackService.EXTRA_REASON, reason ?: "")
            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
                ctx.startForegroundService(intent)
            } else {
                ctx.startService(intent)
            }
            promise.resolve(true)
        } catch (e: Exception) {
            Log.w(TAG, "start fehlgeschlagen: ${e.message}")
            promise.reject("START_FAILED", e.message ?: "Unbekannter Fehler", e)
        }
    }

    @ReactMethod
    fun stop(promise: Promise) {
        try {
            val ctx = reactApplicationContext
            ctx.stopService(Intent(ctx, AriaPlaybackService::class.java))
            promise.resolve(true)
        } catch (e: Exception) {
            Log.w(TAG, "stop fehlgeschlagen: ${e.message}")
            promise.reject("STOP_FAILED", e.message ?: "Unbekannter Fehler", e)
        }
    }

    @ReactMethod fun addListener(eventName: String) {}
    @ReactMethod fun removeListeners(count: Int) {}
}
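A minimal JS-side sketch of how the bridge is meant to wrap a playback: start the service before audio begins and stop it in a finally block so the notification never lingers. The `play` callback is a placeholder, not an API from this repo.

```ts
import { NativeModules } from 'react-native';

const { BackgroundAudio } = NativeModules;

// Keep the process alive while audio runs in the background.
// reason ("tts" | "wake" | "rec" | "") picks the notification text.
async function withBackgroundAudio(reason: string, play: () => Promise<void>): Promise<void> {
  await BackgroundAudio.start(reason);
  try {
    await play();
  } finally {
    await BackgroundAudio.stop(); // removes the ongoing notification
  }
}
```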
@@ -0,0 +1,16 @@
package com.ariacockpit

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class BackgroundAudioPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(BackgroundAudioModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
        return emptyList()
    }
}
@@ -0,0 +1,55 @@
package com.ariacockpit

import android.content.Intent
import android.net.Uri
import android.os.Build
import androidx.core.content.FileProvider
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import java.io.File

/**
 * Opens an arbitrary file (PDF, image, office doc, ...) with the app the
 * user picks via the Android intent chooser. Uses a FileProvider so that
 * Android 7+ (content:// instead of file://) is allowed to read the URI.
 *
 * The MIME type is determined by the caller — the app selection depends on
 * it (PDF goes to a PDF viewer, image/jpeg to the gallery, etc.).
 */
class FileOpenerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    override fun getName() = "FileOpener"

    @ReactMethod
    fun open(filePath: String, mimeType: String, promise: Promise) {
        try {
            val cleanPath = filePath.removePrefix("file://")
            val file = File(cleanPath)
            if (!file.exists()) {
                promise.reject("FILE_NOT_FOUND", "Datei nicht gefunden: $cleanPath")
                return
            }
            val context = reactApplicationContext
            val uri: Uri = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.N) {
                FileProvider.getUriForFile(context, "${context.packageName}.fileprovider", file)
            } else {
                Uri.fromFile(file)
            }
            val safeMime = if (mimeType.isBlank()) "application/octet-stream" else mimeType
            val intent = Intent(Intent.ACTION_VIEW).apply {
                setDataAndType(uri, safeMime)
                addFlags(Intent.FLAG_ACTIVITY_NEW_TASK)
                addFlags(Intent.FLAG_GRANT_READ_URI_PERMISSION)
            }
            // The chooser shows the Android picker if several apps can open this MIME type.
            val chooser = Intent.createChooser(intent, "Oeffnen mit").apply {
                addFlags(Intent.FLAG_ACTIVITY_NEW_TASK)
            }
            context.startActivity(chooser)
            promise.resolve(true)
        } catch (e: Exception) {
            promise.reject("OPEN_ERROR", e.message, e)
        }
    }
}
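On the JS side a call is essentially one line; a hedged sketch follows, in which the example file path is purely illustrative:

```ts
import { NativeModules } from 'react-native';

const { FileOpener } = NativeModules;

// Hand a file to whatever viewer the user picks; the MIME type decides
// which apps Android offers. Accepts plain paths or file:// URIs.
async function openFile(path: string, mime: string): Promise<void> {
  try {
    await FileOpener.open(path, mime);
  } catch (e: any) {
    // The native side rejects with FILE_NOT_FOUND or OPEN_ERROR.
    console.warn(`open failed: ${e?.code ?? ''} ${e?.message ?? e}`);
  }
}

// Illustrative only; not a real path from this repo:
// openFile('/data/user/0/com.ariacockpit/files/report.pdf', 'application/pdf');
```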
@@ -19,6 +19,11 @@ class MainApplication : Application(), ReactApplication {
        override fun getPackages(): List<ReactPackage> =
            PackageList(this).packages.apply {
                add(ApkInstallerPackage())
                add(AudioFocusPackage())
                add(PcmStreamPlayerPackage())
                add(OpenWakeWordPackage())
                add(PhoneCallPackage())
                add(BackgroundAudioPackage())
            }

        override fun getJSMainModuleName(): String = "index"
@@ -0,0 +1,413 @@
package com.ariacockpit

import ai.onnxruntime.OnnxTensor
import ai.onnxruntime.OrtEnvironment
import ai.onnxruntime.OrtSession
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.media.audiofx.AcousticEchoCanceler
import android.media.audiofx.AutomaticGainControl
import android.media.audiofx.NoiseSuppressor
import android.util.Log
import androidx.core.content.ContextCompat
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.modules.core.DeviceEventManagerModule
import java.nio.FloatBuffer
import java.util.concurrent.atomic.AtomicBoolean

/**
 * On-device wake-word detection via openWakeWord (https://github.com/dscripka/openWakeWord).
 *
 * Three-stage ONNX pipeline:
 *  1. audio (16 kHz mono int16, 1280-sample chunks) → melspectrogram → 32-mel frames
 *  2. sliding window of 76 mel frames (stride 8) → speech embedding → 96-dim vector
 *  3. last 16 embeddings (~1.28 s of context) → wake-word classifier → sigmoid score
 *
 * The models live in assets/openwakeword/ (mel + embedding are shared, plus
 * one dedicated .onnx per keyword). A detection fires after `patience`
 * consecutive frames above `threshold` and suppresses repeats for `debounceMs`.
 *
 * Emits "WakeWordDetected" as an RN event when a trigger was recognized.
 */
class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    override fun getName() = "OpenWakeWord"

    companion object {
        private const val TAG = "OpenWakeWord"
        private const val SAMPLE_RATE = 16000
        private const val CHUNK_SAMPLES = 1280 // 80 ms @ 16 kHz
        private const val MEL_FRAMES_PER_EMBEDDING = 76 // embedding window
        private const val EMBEDDING_STRIDE = 8 // slide by 8 mel frames
        private const val EMBEDDING_DIM = 96
        private const val MEL_BINS = 32
        private const val DEFAULT_WW_INPUT_FRAMES = 16 // fallback if the model metadata is missing
    }

    private val env: OrtEnvironment = OrtEnvironment.getEnvironment()
    private var melSession: OrtSession? = null
    private var embSession: OrtSession? = null
    private var wwSession: OrtSession? = null

    private var melInputName: String = "input"
    private var embInputName: String = "input_1"
    private var wwInputName: String = "input"
    // Number of embedding frames the wake-word classifier expects per
    // inference — hey_jarvis uses 16, other community models may differ
    // (e.g. 28). Read from the model metadata during init().
    private var wwInputFrames: Int = DEFAULT_WW_INPUT_FRAMES

    // configuration
    private var threshold: Float = 0.5f
    private var patience: Int = 2
    private var debounceMs: Long = 1500
    private var modelName: String = "hey_jarvis"

    // audio capture thread
    private var audioRecord: AudioRecord? = null
    private val running = AtomicBoolean(false)
    private var captureThread: Thread? = null

    // Audio effects: echo cancellation (against ARIA's own TTS voice, which
    // would otherwise trigger the wake word) + noise suppression. Already
    // implied by the VOICE_COMMUNICATION audio source, but enabling them
    // explicitly is more robust.
    private var aec: AcousticEchoCanceler? = null
    private var ns: NoiseSuppressor? = null
    private var agc: AutomaticGainControl? = null

    // inference state
    private val melBuffer: ArrayList<FloatArray> = ArrayList(256) // list of 32-dim frames
    private var melProcessedIdx: Int = 0
    private val embBuffer: ArrayDeque<FloatArray> = ArrayDeque(32) // ring buffer of the latest embeddings
    private var consecutiveAboveThreshold: Int = 0
    private var lastDetectionMs: Long = 0L

    /**
     * Initializes the ONNX sessions for a given wake word.
     * modelName: file name without suffix (e.g. "hey_jarvis", "alexa", "hey_mycroft", "hey_rhasspy")
     */
    @ReactMethod
    fun init(modelName: String, threshold: Double, patience: Int, debounceMs: Int, promise: Promise) {
        try {
            disposeSessions()
            this.modelName = modelName
            this.threshold = threshold.toFloat()
            this.patience = patience.coerceAtLeast(1)
            this.debounceMs = debounceMs.toLong()

            val ctx = reactApplicationContext
            val melBytes = ctx.assets.open("openwakeword/melspectrogram.onnx").use { it.readBytes() }
            val embBytes = ctx.assets.open("openwakeword/embedding_model.onnx").use { it.readBytes() }
            val wwBytes = ctx.assets.open("openwakeword/$modelName.onnx").use { it.readBytes() }

            val opts = OrtSession.SessionOptions()
            melSession = env.createSession(melBytes, opts)
            embSession = env.createSession(embBytes, opts)
            wwSession = env.createSession(wwBytes, opts)

            melInputName = melSession!!.inputNames.first()
            embInputName = embSession!!.inputNames.first()
            wwInputName = wwSession!!.inputNames.first()

            // Read the WW input frame count from the model — it varies per
            // keyword. Expected shape: (1, N, 96); N comes from the model metadata.
            val wwInputInfo = wwSession!!.inputInfo[wwInputName]
            val wwShape = (wwInputInfo?.info as? ai.onnxruntime.TensorInfo)?.shape
            wwInputFrames = wwShape?.getOrNull(1)?.toInt()?.takeIf { it > 0 } ?: DEFAULT_WW_INPUT_FRAMES

            Log.i(TAG, "Init OK: model=$modelName wwFrames=$wwInputFrames threshold=$threshold patience=$patience " +
                "debounce=${debounceMs}ms (inputs: mel=$melInputName emb=$embInputName ww=$wwInputName)")
            promise.resolve(true)
        } catch (e: Exception) {
            Log.e(TAG, "Init fehlgeschlagen: ${e.message}", e)
            disposeSessions()
            promise.reject("INIT_FAILED", e.message ?: "Unbekannter Fehler", e)
        }
    }

    @ReactMethod
    fun start(promise: Promise) {
        if (running.get()) {
            promise.resolve(true)
            return
        }
        if (melSession == null || embSession == null || wwSession == null) {
            promise.reject("NOT_INITIALIZED", "init() muss vor start() aufgerufen werden")
            return
        }
        // Check the permission — the app code usually requests it earlier,
        // but we insist on it explicitly here so AudioRecord does not fail
        // silently.
        val perm = ContextCompat.checkSelfPermission(reactApplicationContext, Manifest.permission.RECORD_AUDIO)
        if (perm != PackageManager.PERMISSION_GRANTED) {
            promise.reject("NO_MIC_PERMISSION", "RECORD_AUDIO Permission fehlt")
            return
        }

        try {
            val minBuf = AudioRecord.getMinBufferSize(
                SAMPLE_RATE,
                AudioFormat.CHANNEL_IN_MONO,
                AudioFormat.ENCODING_PCM_16BIT,
            ).coerceAtLeast(CHUNK_SAMPLES * 2 * 4)

            // The VOICE_COMMUNICATION source automatically enables echo
            // cancellation + noise suppression on most Android devices.
            // Important so that ARIA's own voice does not trigger the wake
            // word while we listen in parallel to the TTS playback.
            val record = AudioRecord(
                MediaRecorder.AudioSource.VOICE_COMMUNICATION,
                SAMPLE_RATE,
                AudioFormat.CHANNEL_IN_MONO,
                AudioFormat.ENCODING_PCM_16BIT,
                minBuf,
            )
            if (record.state != AudioRecord.STATE_INITIALIZED) {
                record.release()
                promise.reject("AUDIO_INIT", "AudioRecord nicht initialisiert (Mikro belegt?)")
                return
            }
            audioRecord = record

            // ADDITIONALLY enable the audio effects explicitly — some devices
            // need this even though VOICE_COMMUNICATION should already bring
            // them along. Failure is not critical (continue without effects).
            try {
                if (AcousticEchoCanceler.isAvailable()) {
                    aec = AcousticEchoCanceler.create(record.audioSessionId)?.apply { enabled = true }
                    Log.i(TAG, "AEC aktiviert (enabled=${aec?.enabled})")
                }
            } catch (e: Exception) { Log.w(TAG, "AEC failed: ${e.message}") }
            try {
                if (NoiseSuppressor.isAvailable()) {
                    ns = NoiseSuppressor.create(record.audioSessionId)?.apply { enabled = true }
                }
            } catch (e: Exception) { Log.w(TAG, "NS failed: ${e.message}") }
            try {
                if (AutomaticGainControl.isAvailable()) {
                    agc = AutomaticGainControl.create(record.audioSessionId)?.apply { enabled = true }
                }
            } catch (e: Exception) { Log.w(TAG, "AGC failed: ${e.message}") }

            resetInferenceState()
            running.set(true)
            record.startRecording()

            captureThread = Thread({ captureLoop() }, "OpenWakeWordCapture").apply {
                isDaemon = true
                start()
            }

            Log.i(TAG, "Lauschen gestartet (model=$modelName)")
            promise.resolve(true)
        } catch (e: Exception) {
            Log.e(TAG, "start fehlgeschlagen", e)
            running.set(false)
            audioRecord?.release()
            audioRecord = null
            promise.reject("START_FAILED", e.message ?: "Unbekannter Fehler", e)
        }
    }

    private fun releaseAudioEffects() {
        try { aec?.release() } catch (_: Exception) {}
        try { ns?.release() } catch (_: Exception) {}
        try { agc?.release() } catch (_: Exception) {}
        aec = null; ns = null; agc = null
    }

    @ReactMethod
    fun stop(promise: Promise) {
        running.set(false)
        try {
            captureThread?.join(1500)
        } catch (_: InterruptedException) {}
        captureThread = null
        try { audioRecord?.stop() } catch (_: Exception) {}
        try { audioRecord?.release() } catch (_: Exception) {}
        audioRecord = null
        releaseAudioEffects()
        Log.i(TAG, "Lauschen gestoppt")
        promise.resolve(true)
    }

    @ReactMethod
    fun dispose(promise: Promise) {
        running.set(false)
        try { captureThread?.join(1000) } catch (_: InterruptedException) {}
        captureThread = null
        try { audioRecord?.stop() } catch (_: Exception) {}
        try { audioRecord?.release() } catch (_: Exception) {}
        audioRecord = null
        releaseAudioEffects()
        disposeSessions()
        promise.resolve(true)
    }

    @ReactMethod
    fun isAvailable(promise: Promise) {
        // The wake word is always available (no API key, everything on-device)
        promise.resolve(true)
    }

    // RN event subscriptions — RN convention; without these, a warning in debug builds
    @ReactMethod fun addListener(eventName: String) {}
    @ReactMethod fun removeListeners(count: Int) {}

    private fun disposeSessions() {
        try { melSession?.close() } catch (_: Exception) {}
        try { embSession?.close() } catch (_: Exception) {}
        try { wwSession?.close() } catch (_: Exception) {}
        melSession = null
        embSession = null
        wwSession = null
    }

    private fun resetInferenceState() {
        melBuffer.clear()
        melProcessedIdx = 0
        embBuffer.clear()
        consecutiveAboveThreshold = 0
        lastDetectionMs = 0L
    }

    private fun emitDetected() {
        val params = com.facebook.react.bridge.Arguments.createMap().apply {
            putString("model", modelName)
        }
        try {
            reactApplicationContext
                .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
                .emit("WakeWordDetected", params)
        } catch (e: Exception) {
            Log.w(TAG, "emit fehlgeschlagen: ${e.message}")
        }
    }

    private fun captureLoop() {
        val buf = ShortArray(CHUNK_SAMPLES)
        val record = audioRecord ?: return
        Log.i(TAG, "Capture-Loop gestartet")
        while (running.get()) {
            var read = 0
            while (read < CHUNK_SAMPLES && running.get()) {
                val n = record.read(buf, read, CHUNK_SAMPLES - read)
                if (n <= 0) {
                    Log.w(TAG, "AudioRecord.read returned $n — Loop ende")
                    running.set(false)
                    return
                }
                read += n
            }
            if (!running.get()) break
            try {
                processChunk(buf)
            } catch (e: Exception) {
                Log.w(TAG, "processChunk: ${e.message}")
            }
        }
        Log.i(TAG, "Capture-Loop beendet")
    }

    /** Processes one 1280-sample int16 audio chunk. */
    private fun processChunk(audio: ShortArray) {
        // 1) audio → mel (output shape (1, 1, frames, 32))
        val floats = FloatArray(audio.size) { audio[it].toFloat() }
        val melTensor = OnnxTensor.createTensor(
            env,
            FloatBuffer.wrap(floats),
            longArrayOf(1L, audio.size.toLong()),
        )
        val melResult = melSession!!.run(mapOf(melInputName to melTensor))
        val melOut = melResult.get(0).value
        melTensor.close()
        @Suppress("UNCHECKED_CAST")
        val mel4 = melOut as Array<Array<Array<FloatArray>>>
        val frames = mel4[0][0]
        // openWakeWord applies `mel/10 + 2` before feeding the embedding model
        for (frame in frames) {
            val scaled = FloatArray(frame.size) { frame[it] / 10f + 2f }
            melBuffer.add(scaled)
        }
        melResult.close()

        // 2) sliding window: process every complete 76-frame window
        while (melBuffer.size >= melProcessedIdx + MEL_FRAMES_PER_EMBEDDING) {
            val flat = FloatArray(MEL_FRAMES_PER_EMBEDDING * MEL_BINS)
            var pos = 0
            for (i in 0 until MEL_FRAMES_PER_EMBEDDING) {
                val src = melBuffer[melProcessedIdx + i]
                System.arraycopy(src, 0, flat, pos, MEL_BINS)
                pos += MEL_BINS
            }
            val embIn = OnnxTensor.createTensor(
                env,
                FloatBuffer.wrap(flat),
                longArrayOf(1L, MEL_FRAMES_PER_EMBEDDING.toLong(), MEL_BINS.toLong(), 1L),
            )
            val embRes = embSession!!.run(mapOf(embInputName to embIn))
            val embOut = embRes.get(0).value
            embIn.close()
            // Expected output shape: (1, 1, 1, 96) — rank 4, NOT (1, 96).
            // The Google embedding pipeline keeps the extra dimensions.
            @Suppress("UNCHECKED_CAST")
            val embArr = embOut as Array<Array<Array<FloatArray>>>
            embBuffer.addLast(embArr[0][0][0].copyOf())
            while (embBuffer.size > wwInputFrames) embBuffer.removeFirst()
            embRes.close()

            melProcessedIdx += EMBEDDING_STRIDE
        }
        // Trim the mel buffer — prevents unbounded memory growth
        if (melProcessedIdx > MEL_FRAMES_PER_EMBEDDING) {
            val keepFrom = melProcessedIdx - MEL_FRAMES_PER_EMBEDDING
            val newList = ArrayList<FloatArray>(melBuffer.size - keepFrom)
            for (i in keepFrom until melBuffer.size) newList.add(melBuffer[i])
            melBuffer.clear()
            melBuffer.addAll(newList)
            melProcessedIdx = MEL_FRAMES_PER_EMBEDDING
        }

        // 3) classification — once we have wwInputFrames embeddings
        if (embBuffer.size < wwInputFrames) return
        val flatEmb = FloatArray(wwInputFrames * EMBEDDING_DIM)
        var p = 0
        // Take the last wwInputFrames embeddings (embBuffer is capped at wwInputFrames)
        for (e in embBuffer) {
            System.arraycopy(e, 0, flatEmb, p, EMBEDDING_DIM)
            p += EMBEDDING_DIM
        }
        val wwIn = OnnxTensor.createTensor(
            env,
            FloatBuffer.wrap(flatEmb),
            longArrayOf(1L, wwInputFrames.toLong(), EMBEDDING_DIM.toLong()),
        )
        val wwRes = wwSession!!.run(mapOf(wwInputName to wwIn))
        val wwOut = wwRes.get(0).value
        wwIn.close()
        // Expected output shape: (1, 1) → Array<FloatArray>
        @Suppress("UNCHECKED_CAST")
        val score = (wwOut as Array<FloatArray>)[0][0]
        wwRes.close()

        if (score >= threshold) {
            consecutiveAboveThreshold++
            if (consecutiveAboveThreshold >= patience) {
                val now = System.currentTimeMillis()
                if (now - lastDetectionMs >= debounceMs) {
                    lastDetectionMs = now
                    consecutiveAboveThreshold = 0
                    Log.i(TAG, "Wake-Word erkannt! score=$score model=$modelName")
                    emitDetected()
                }
            }
        } else {
            consecutiveAboveThreshold = 0
        }
    }
}
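A hedged sketch of the matching JS side. The init parameters mirror the Kotlin defaults shown above (`hey_jarvis`, threshold 0.5, patience 2, debounce 1500 ms); the handler wiring itself is illustrative, not copied from this repo.

```ts
import { NativeModules, NativeEventEmitter } from 'react-native';

const { OpenWakeWord } = NativeModules;
const wakeEvents = new NativeEventEmitter(OpenWakeWord);

// Load the three ONNX sessions and start the capture loop.
async function startWakeWord(onDetected: (model: string) => void): Promise<void> {
  wakeEvents.addListener('WakeWordDetected', (e: { model: string }) => onDetected(e.model));
  // Arguments: model name, threshold, patience, debounceMs (the native defaults).
  await OpenWakeWord.init('hey_jarvis', 0.5, 2, 1500);
  await OpenWakeWord.start(); // rejects with NO_MIC_PERMISSION if RECORD_AUDIO is missing
}

async function stopWakeWord(): Promise<void> {
  await OpenWakeWord.stop();    // ends the capture loop, keeps the sessions
  await OpenWakeWord.dispose(); // additionally frees the ONNX sessions
}
```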
@@ -0,0 +1,16 @@
package com.ariacockpit

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class OpenWakeWordPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(OpenWakeWordModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
        return emptyList()
    }
}
@@ -0,0 +1,374 @@
package com.ariacockpit

import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.os.Build
import android.util.Base64
import android.util.Log
import com.facebook.react.bridge.Arguments
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.modules.core.DeviceEventManagerModule
import java.util.concurrent.LinkedBlockingQueue

/**
 * Streams PCM s16le audio directly via AudioTrack MODE_STREAM with a pre-roll.
 *
 * Pre-roll: the AudioTrack is built and fed right away, but play() is only
 * called once PREROLL_SECONDS of audio is buffered. That gives the stream
 * time to build up a reserve — if XTTS renders at RTF > 1 (slower than real
 * time), the buffer still does not run dry.
 *
 * Flow:
 *   JS: start(sampleRate, channels) → opens the AudioTrack (play() not yet called)
 *   JS: writeChunk(base64)          → decodes, enqueues; the writer writes
 *   writer: starts playing as soon as the pre-roll is reached
 *   JS: end()  → waits until the queue is empty, then closes
 *   JS: stop() → hard stop (cancel)
 */
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    companion object {
        private const val TAG = "PcmStreamPlayer"
        // Fallback if JS passes no value.
        private const val DEFAULT_PREROLL_SECONDS = 3.5
        // 0.0 = immediate playback — play() right at the first chunk. Makes
        // sense for F5-TTS because rendering is so fast that a buffer is
        // unnecessary and can even get in the way with short sentences.
        private const val MIN_PREROLL_SECONDS = 0.0
        private const val MAX_PREROLL_SECONDS = 10.0
        // Silence at the start of the stream so the AudioTrack spins up
        // cleanly and the first samples are not cut off (XTTS warm-up +
        // play() latency).
        private const val LEADING_SILENCE_SECONDS = 0.3
        // Silence at the end — cushions the hardware flush so the last real
        // samples are guaranteed to be played out before stop() arrives.
        private const val TRAILING_SILENCE_SECONDS = 0.3
    }

    override fun getName() = "PcmStreamPlayer"

    private var track: AudioTrack? = null
    private val queue = LinkedBlockingQueue<ByteArray>()
    private var writerThread: Thread? = null
    @Volatile private var writerShouldStop = false
    @Volatile private var endRequested = false
    @Volatile private var prerollBytes: Int = 0
    @Volatile private var playbackStarted = false
    @Volatile private var bytesBuffered: Long = 0
    @Volatile private var streamBytesPerFrame: Int = 2 // mono s16le default

    // ── Lifecycle ──

    @ReactMethod
    fun start(sampleRate: Int, channels: Int, prerollSeconds: Double, promise: Promise) {
        try {
            // End the previous session if there is one
            stopInternal()

            // Only NaN/Inf → default. 0.0 is valid (= immediate playback).
            val prerollSec = if (prerollSeconds.isFinite() && prerollSeconds >= 0.0) {
                prerollSeconds.coerceIn(MIN_PREROLL_SECONDS, MAX_PREROLL_SECONDS)
            } else {
                DEFAULT_PREROLL_SECONDS
            }

            val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
            val encoding = AudioFormat.ENCODING_PCM_16BIT
            val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
            val bytesPerSecond = sampleRate * channels * 2 // 16-bit = 2 bytes
            val prerollTarget = (bytesPerSecond * prerollSec).toInt()
            // Buffer size decoupled from the pre-roll — a fixed ~4 s buffer.
            // On a OnePlus with Android 12 and USAGE_ASSISTANT, AudioTrack
            // only gets going once ~3 s of data is buffered. We pad short
            // texts to 3 s before play() (see the block after mainLoop); the
            // buffer needs ~1 s of headroom because write() blocks.
            val bufferSize = (bytesPerSecond * 4).coerceAtLeast(minBuf * 8)
            prerollBytes = prerollTarget
            bytesBuffered = 0
            playbackStarted = false
            streamBytesPerFrame = channels * 2 // s16 = 2 bytes per sample

            val newTrack = AudioTrack.Builder()
                .setAudioAttributes(
                    AudioAttributes.Builder()
                        .setUsage(AudioAttributes.USAGE_ASSISTANT)
                        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                        .build(),
                )
                .setAudioFormat(
                    AudioFormat.Builder()
                        .setSampleRate(sampleRate)
                        .setChannelMask(channelConfig)
                        .setEncoding(encoding)
                        .build(),
                )
                .setBufferSizeInBytes(bufferSize)
                .setTransferMode(AudioTrack.MODE_STREAM)
                .build()

            // Lower the start threshold: the default is bufferSize/2 (= 2 s
            // with a 4 s buffer). AudioTrack otherwise will not start before
            // 2 s are buffered — with short TTS answers (3 words ≈ 1.4 s) pos
            // stays stuck at 0. 0.1 s is enough for AudioTrack to get going
            // right away with the first chunk.
            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
                try {
                    val startFrames = (sampleRate / 10).coerceAtLeast(1) // 100 ms
                    newTrack.setStartThresholdInFrames(startFrames)
                    Log.i(TAG, "Start-Threshold gesetzt: ${startFrames} frames (~100ms)")
                } catch (e: Exception) {
                    Log.w(TAG, "setStartThresholdInFrames failed: ${e.message}")
                }
            }

            track = newTrack
            queue.clear()
            writerShouldStop = false
            endRequested = false

            writerThread = Thread({
                val t = track ?: return@Thread
                try {
                    // Write leading silence into the buffer — gives the
                    // AudioTrack time to spin up.
                    val leadingBytes = ((sampleRate * channels * 2) * LEADING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
                    if (leadingBytes > 0) {
                        val silence = ByteArray(leadingBytes)
                        var silOff = 0
                        while (silOff < silence.size && !writerShouldStop) {
                            val w = t.write(silence, silOff, silence.size - silOff)
                            if (w <= 0) break
                            silOff += w
                        }
                        bytesBuffered += silence.size
                    }
                    // With preroll=0: call play() IMMEDIATELY after the
                    // leading silence, not only when the first real chunk
                    // arrives. Android's AudioTrack keeps the play state and
                    // waits for new samples. That way no words get swallowed
                    // if the first chunk only arrives after the play()
                    // start-up latency.
                    if (prerollBytes == 0 && !playbackStarted) {
                        try {
                            t.play()
                            playbackStarted = true
                            Log.i(TAG, "Playback sofort gestartet (preroll=0, ${bytesBuffered}B silence)")
                        } catch (e: Exception) {
                            Log.w(TAG, "play() sofort failed: ${e.message}")
                        }
                    }
                    // Idle cutoff: if endRequested did NOT arrive but nothing
                    // comes in for a long time, we abort (bridge crash, lost
                    // final). 120 s so that long F5-TTS render pauses between
                    // sentences (e.g. on a model switch or a cold GPU) do not
                    // tear the stream down.
                    var idleMs = 0L
                    val maxIdleMs = 120_000L
                    // Target buffer fill — below this watermark we feed in
                    // silence so the AudioTrack does not underrun while the
                    // bridge renders the next sentence. Spotify/YouTube
                    // otherwise react with an unprompted resume after ~10 s
                    // of silence.
                    val underrunGuardFrames = sampleRate / 10 // ~100 ms
                    val silenceFillFrames = sampleRate / 20 // ~50 ms per refill

                    mainLoop@ while (!writerShouldStop) {
                        val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS)
                        if (data == null) {
                            if (endRequested) {
                                // If play() never ran at all (a stream without
                                // any data — a very rare edge case): kick it
                                // off now so the wait in finally{} does not
                                // block forever.
                                if (!playbackStarted) {
                                    try { t.play(); playbackStarted = true } catch (_: Exception) {}
                                }
                                break@mainLoop
                            }
                            // Underrun guard: feed in silence when the
                            // AudioTrack buffer threatens to run dry. Spotify
                            // otherwise resumes on its own after ~10 s of
                            // pause, even though we still hold the focus.
                            if (playbackStarted) {
                                val framesWritten = bytesBuffered / streamBytesPerFrame
                                val framesPlayed = t.playbackHeadPosition.toLong()
                                val framesInBuffer = framesWritten - framesPlayed
                                if (framesInBuffer < underrunGuardFrames) {
                                    val fillBytes = silenceFillFrames * streamBytesPerFrame
                                    val silence = ByteArray(fillBytes)
                                    var silOff = 0
                                    while (silOff < silence.size && !writerShouldStop) {
                                        val w = t.write(silence, silOff, silence.size - silOff)
                                        if (w <= 0) break
                                        silOff += w
                                    }
                                    bytesBuffered += silence.size
                                }
                            }
                            idleMs += 50L
                            if (idleMs >= maxIdleMs) {
                                Log.w(TAG, "Idle-Cutoff: ${maxIdleMs}ms keine Daten — Stream wird beendet")
                                break@mainLoop
                            }
                            continue@mainLoop
                        }
                        idleMs = 0L

                        // Call play() at the VERY FIRST data chunk — no matter
                        // how little data there is. AudioTrack otherwise
                        // stalls on a OnePlus with Android 12 if play() is
                        // only called after the buffer is completely filled.
                        // Building up the pre-roll reserve then happens while
                        // the track is already playing — the underrun guard
                        // feeds silence if needed.
                        if (!playbackStarted) {
                            try {
                                t.play()
                                playbackStarted = true
                                Log.i(TAG, "Playback gestartet beim 1. Chunk (${bytesBuffered}B leading + ${data.size}B data)")
                            } catch (e: Exception) {
                                Log.w(TAG, "play() failed: ${e.message}")
                            }
                        }

                        var offset = 0
                        while (offset < data.size && !writerShouldStop) {
                            val written = t.write(data, offset, data.size - offset)
                            if (written <= 0) break
                            offset += written
                        }
                        bytesBuffered += data.size
                    }
                    // Trailing silence so the last real samples are guaranteed
                    // to make it through the hardware buffering before stop()
                    // cuts them off
                    val trailingBytes = ((sampleRate * channels * 2) * TRAILING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
                    if (trailingBytes > 0 && !writerShouldStop) {
                        val silence = ByteArray(trailingBytes)
                        var silOff = 0
                        while (silOff < silence.size && !writerShouldStop) {
                            val w = t.write(silence, silOff, silence.size - silOff)
                            if (w <= 0) break
                            silOff += w
                        }
                        bytesBuffered += silence.size
                    }
                } catch (e: Exception) {
                    Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
                } finally {
                    // Wait until all written samples have actually been
                    // played; otherwise t.release() cuts off the last seconds.
                    try {
                        val totalFrames = (bytesBuffered / streamBytesPerFrame).toInt()
                        var lastPos = -1
                        var stalledCount = 0
                        var retried = false
                        while (!writerShouldStop) {
                            val pos = t.playbackHeadPosition
                            if (pos >= totalFrames) break
                            if (pos == lastPos) {
                                stalledCount++
                                // After 500 ms of standstill: an AudioTrack
                                // quirk on some devices (OnePlus, Android 12)
                                // — poke play() once more.
                                if (stalledCount == 10 && pos == 0 && !retried) {
                                    retried = true
                                    Log.w(TAG, "playback nicht angefahren — retry play()")
                                    try { t.play() } catch (e: Exception) {
                                        Log.w(TAG, "retry play() failed: ${e.message}")
                                    }
                                }
                                if (stalledCount > 40) {
                                    Log.w(TAG, "playback stalled at $pos/$totalFrames — give up")
                                    break
                                }
                            } else {
                                stalledCount = 0
                                lastPos = pos
                            }
                            Thread.sleep(50)
                        }
                        Log.i(TAG, "Playback fertig: frames=$totalFrames pos=${t.playbackHeadPosition}")
                    } catch (_: Exception) {}
                    try { t.stop() } catch (_: Exception) {}
                    try { t.release() } catch (_: Exception) {}
                    // RN event: the AudioTrack is really done (all samples
                    // played). Only NOW does JS release the AudioFocus —
                    // otherwise Spotify starts playing at the end() cap while
                    // ARIA is still talking (15 s+ depending on the buffer).
                    try {
                        val params = Arguments.createMap()
                        reactApplicationContext
                            .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
                            .emit("PcmPlaybackFinished", params)
                    } catch (e: Exception) {
                        Log.w(TAG, "PlaybackFinished emit failed: ${e.message}")
                    }
                }
            }, "PcmStreamWriter").apply { start() }

            Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B preroll=${prerollBytes}B (${prerollSec}s)")
            promise.resolve(true)
        } catch (e: Exception) {
            Log.e(TAG, "start fehlgeschlagen", e)
            promise.reject("START_FAILED", e.message, e)
        }
    }

    @ReactMethod
    fun writeChunk(base64Pcm: String, promise: Promise) {
        try {
            if (base64Pcm.isEmpty()) {
                promise.resolve(true)
                return
            }
            val bytes = Base64.decode(base64Pcm, Base64.DEFAULT)
            queue.put(bytes)
            promise.resolve(true)
        } catch (e: Exception) {
            promise.reject("WRITE_FAILED", e.message, e)
        }
    }

    /** Signals: no further chunks. The writer plays out, then stops.
     * The promise only resolves once the writer thread is done — important
     * so the caller releases the AudioFocus only AFTER the last played
     * sample (otherwise Spotify ramps up while the pre-roll is still
     * playing out).
     */
    @ReactMethod
    fun end(promise: Promise) {
        endRequested = true
        val t = writerThread
        if (t == null || !t.isAlive) {
            promise.resolve(true)
            return
        }
        // Wait for the writer in the background — no thread block for the JS bridge
        Thread({
            try {
                t.join(15_000) // hard cap in case the writer hangs
            } catch (_: InterruptedException) {}
            promise.resolve(true)
        }, "PcmStreamEndWaiter").start()
    }

    /** Hard stop (cancel) — discard the queue. */
    @ReactMethod
    fun stop(promise: Promise) {
        stopInternal()
        promise.resolve(true)
    }

    @ReactMethod fun addListener(eventName: String) {}
    @ReactMethod fun removeListeners(count: Int) {}

    private fun stopInternal() {
        writerShouldStop = true
        endRequested = true
        queue.clear()
        writerThread?.interrupt()
        writerThread = null
        val t = track
        if (t != null) {
            try { t.stop() } catch (_: Exception) {}
            try { t.release() } catch (_: Exception) {}
        }
        track = null
    }

    override fun onCatalystInstanceDestroy() {
        stopInternal()
        super.onCatalystInstanceDestroy()
    }
}
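A hedged JS-side sketch of the intended flow. The sample rate and the async chunk source are illustrative, not this repo's actual values; only the module methods and the event name come from the code above.

```ts
import { NativeModules, NativeEventEmitter } from 'react-native';

const { PcmStreamPlayer } = NativeModules;
const pcmEvents = new NativeEventEmitter(PcmStreamPlayer);

// Stream base64-encoded s16le PCM chunks into the native player. The
// "PcmPlaybackFinished" event fires only after the last sample was played,
// so audio focus should be released after it, not at end().
async function streamTts(chunks: AsyncIterable<string>): Promise<void> {
  const finished = new Promise<void>((resolve) => {
    const sub = pcmEvents.addListener('PcmPlaybackFinished', () => {
      sub.remove();
      resolve();
    });
  });
  await PcmStreamPlayer.start(24000, 1, 0.0); // sampleRate (illustrative), channels, prerollSeconds
  for await (const b64 of chunks) {
    await PcmStreamPlayer.writeChunk(b64);
  }
  await PcmStreamPlayer.end(); // resolves once the writer thread has drained
  await finished;              // all samples audibly out; safe to release focus
}
```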
@@ -0,0 +1,16 @@
package com.ariacockpit

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class PcmStreamPlayerPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(PcmStreamPlayerModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
        return emptyList()
    }
}
@@ -0,0 +1,126 @@
package com.ariacockpit

import android.Manifest
import android.content.Context
import android.content.pm.PackageManager
import android.os.Build
import android.telephony.PhoneStateListener
import android.telephony.TelephonyCallback
import android.telephony.TelephonyManager
import android.util.Log
import androidx.core.content.ContextCompat
import com.facebook.react.bridge.Arguments
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.modules.core.DeviceEventManagerModule

/**
 * Listens for call-state changes — when the phone rings or a call is
 * active, the module sends a "PhoneCallStateChanged" event to JS.
 *
 * The JS side then stops TTS playback so ARIA does not keep talking into
 * the middle of a conversation. Without the READ_PHONE_STATE permission,
 * start() fails silently — the rest of the app keeps working as before.
 *
 * State strings: "idle" | "ringing" | "offhook"
 */
class PhoneCallModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
    override fun getName() = "PhoneCall"

    companion object { private const val TAG = "PhoneCall" }

    private var telephonyManager: TelephonyManager? = null
    private var legacyListener: PhoneStateListener? = null
    private var modernCallback: Any? = null // TelephonyCallback as of API 31
    private var lastState: Int = TelephonyManager.CALL_STATE_IDLE

    @ReactMethod
    fun start(promise: Promise) {
        try {
            val perm = ContextCompat.checkSelfPermission(reactApplicationContext, Manifest.permission.READ_PHONE_STATE)
            if (perm != PackageManager.PERMISSION_GRANTED) {
                Log.w(TAG, "READ_PHONE_STATE Permission fehlt — Anruf-Erkennung inaktiv")
                promise.resolve(false)
                return
            }
            val tm = reactApplicationContext.getSystemService(Context.TELEPHONY_SERVICE) as? TelephonyManager
            if (tm == null) {
                Log.w(TAG, "TelephonyManager nicht verfuegbar")
                promise.resolve(false)
                return
            }
            telephonyManager = tm

            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
                val cb = object : TelephonyCallback(), TelephonyCallback.CallStateListener {
                    override fun onCallStateChanged(state: Int) {
                        handleStateChange(state)
                    }
                }
                tm.registerTelephonyCallback(reactApplicationContext.mainExecutor, cb)
                modernCallback = cb
            } else {
                @Suppress("DEPRECATION")
                val l = object : PhoneStateListener() {
                    override fun onCallStateChanged(state: Int, phoneNumber: String?) {
                        handleStateChange(state)
                    }
                }
                @Suppress("DEPRECATION")
                tm.listen(l, PhoneStateListener.LISTEN_CALL_STATE)
                legacyListener = l
            }
            Log.i(TAG, "PhoneCall-Listener aktiv")
            promise.resolve(true)
        } catch (e: Exception) {
            Log.e(TAG, "start fehlgeschlagen", e)
            promise.reject("START_FAILED", e.message ?: "Unbekannter Fehler", e)
        }
    }

    @ReactMethod
    fun stop(promise: Promise) {
        try {
            val tm = telephonyManager
            if (tm != null) {
                if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
                    (modernCallback as? TelephonyCallback)?.let { tm.unregisterTelephonyCallback(it) }
                    modernCallback = null
                } else {
                    @Suppress("DEPRECATION")
                    legacyListener?.let { tm.listen(it, PhoneStateListener.LISTEN_NONE) }
                    legacyListener = null
                }
            }
            telephonyManager = null
            lastState = TelephonyManager.CALL_STATE_IDLE
            promise.resolve(true)
        } catch (e: Exception) {
            promise.reject("STOP_FAILED", e.message ?: "")
        }
    }

    private fun handleStateChange(state: Int) {
        if (state == lastState) return
        lastState = state
        val name = when (state) {
            TelephonyManager.CALL_STATE_RINGING -> "ringing"
            TelephonyManager.CALL_STATE_OFFHOOK -> "offhook"
            TelephonyManager.CALL_STATE_IDLE -> "idle"
            else -> return
        }
        Log.i(TAG, "Telefon-State: $name")
        val params = Arguments.createMap().apply { putString("state", name) }
        try {
            reactApplicationContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
                .emit("PhoneCallStateChanged", params)
        } catch (e: Exception) {
            Log.w(TAG, "Event-emit fehlgeschlagen: ${e.message}")
        }
    }

    @ReactMethod fun addListener(eventName: String) {}
    @ReactMethod fun removeListeners(count: Int) {}
}
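For a quick smoke test of the event contract above, the JS side needs nothing more than a NativeEventEmitter subscription. A minimal sketch; the full consumer (phonecall.ts) appears further down in this diff:

import { NativeModules, NativeEventEmitter } from 'react-native';

const { PhoneCall } = NativeModules;
const emitter = new NativeEventEmitter(PhoneCall);
const sub = emitter.addListener('PhoneCallStateChanged', (e: { state: string }) => {
  console.log('call state:', e.state); // "idle" | "ringing" | "offhook"
});
PhoneCall.start().then((ok: boolean) => {
  if (!ok) console.warn('call detection inactive (permission missing?)');
});
// later: sub.remove(); PhoneCall.stop();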
@@ -0,0 +1,16 @@
package com.ariacockpit

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class PhoneCallPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
        return listOf(PhoneCallModule(reactContext))
    }

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
        return emptyList()
    }
}
Binary file not shown.
@@ -1,4 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<paths>
    <cache-path name="cache" path="." />
    <files-path name="files" path="." />
    <external-path name="external" path="." />
    <external-files-path name="external_files" path="." />
    <external-cache-path name="external_cache" path="." />
</paths>
@@ -1,7 +1,9 @@
 buildscript {
     ext {
         buildToolsVersion = "34.0.0"
-        minSdkVersion = 23
+        // 24 = Android 7.0 (Nougat). Required by Porcupine (Picovoice).
+        // Realistically the minimum anyway: everything below 7.0 has <1% market share.
+        minSdkVersion = 24
         compileSdkVersion = 34
         targetSdkVersion = 34
         ndkVersion = "25.1.8937393"
+15
-2
@@ -167,10 +167,23 @@ export CI=true
 if [ "$MODE" = "debug" ]; then
   ./gradlew assembleDebug
-  APK_PATH="app/build/outputs/apk/debug/app-debug.apk"
+  OUT_DIR="app/build/outputs/apk/debug"
 else
   ./gradlew assembleRelease
-  APK_PATH="app/build/outputs/apk/release/app-release.apk"
+  OUT_DIR="app/build/outputs/apk/release"
 fi

+# With ABI splits the APK is named e.g. app-arm64-v8a-release.apk instead of
+# app-release.apk. Try the arm64-v8a variant first (that is our default),
+# with the universal APK as fallback in case splits are disabled.
+if [ -f "$OUT_DIR/app-arm64-v8a-${MODE}.apk" ]; then
+  APK_PATH="$OUT_DIR/app-arm64-v8a-${MODE}.apk"
+elif [ -f "$OUT_DIR/app-${MODE}.apk" ]; then
+  APK_PATH="$OUT_DIR/app-${MODE}.apk"
+else
+  echo -e "${RED}Keine passende APK in $OUT_DIR gefunden${NC}"
+  cd ..
+  exit 1
+fi

 cd ..
+18
-17
@@ -1,6 +1,6 @@
 {
   "name": "aria-cockpit",
-  "version": "0.0.3.9",
+  "version": "0.1.2.2",
   "private": true,
   "scripts": {
     "android": "react-native run-android",
@@ -10,31 +10,32 @@
    "build:apk": "cd android && ./gradlew assembleRelease"
  },
  "dependencies": {
    "@react-native-async-storage/async-storage": "^1.21.0",
    "@react-native-community/geolocation": "^3.2.1",
    "@react-navigation/bottom-tabs": "^6.5.11",
    "@react-navigation/native": "^6.1.9",
    "react": "18.2.0",
    "react-native": "0.73.4",
    "@react-navigation/native": "^6.1.9",
    "@react-navigation/bottom-tabs": "^6.5.11",
    "react-native-screens": "3.27.0",
    "react-native-safe-area-context": "^4.8.2",
    "react-native-audio-recorder-player": "^3.6.7",
    "react-native-camera-kit": "^13.0.0",
    "react-native-document-picker": "^9.1.1",
    "react-native-sound": "^0.11.2",
    "@react-native-community/geolocation": "^3.2.1",
    "react-native-fs": "^2.20.0",
    "react-native-image-picker": "^7.1.0",
    "react-native-permissions": "^4.1.4",
    "react-native-camera-kit": "^13.0.0",
    "@react-native-async-storage/async-storage": "^1.21.0",
    "react-native-fs": "^2.20.0",
    "react-native-audio-recorder-player": "^3.6.7"
    "react-native-safe-area-context": "^4.8.2",
    "react-native-screens": "3.27.0",
    "react-native-sound": "^0.11.2",
    "react-native-svg": "^14.1.0"
  },
  "devDependencies": {
    "typescript": "^5.3.3",
    "@react-native/eslint-config": "^0.73.2",
    "@react-native/metro-config": "^0.73.5",
    "@react-native/typescript-config": "^0.73.1",
    "@types/jest": "^29.5.11",
    "@types/react": "^18.2.48",
    "@types/react-native": "^0.73.0",
    "@react-native/eslint-config": "^0.73.2",
    "@react-native/typescript-config": "^0.73.1",
    "@react-native/metro-config": "^0.73.5",
    "metro-react-native-babel-preset": "^0.77.0",
    "jest": "^29.7.0",
    "@types/jest": "^29.5.11"
    "metro-react-native-babel-preset": "^0.77.0",
    "typescript": "^5.3.3"
  }
}
Binary file not shown.
@@ -0,0 +1,88 @@
/**
 * MessageText — selectable chat text with Android auto-linkification,
 * plus inline image rendering when the text contains image URLs.
 *
 * - Markdown image syntax and plain `https://...image.png` URLs are
 *   detected — the URL stays visible in the text (clickable via Linkify),
 *   and the image is additionally rendered below it as <Image> or <SvgUri>.
 * - We use Android's dataDetectorType="all" (the system makes phone/URL/
 *   email clickable automatically) and a single <Text selectable> without
 *   nested <Text> carrying its own onPress — nested Text with onPress
 *   swallowed the long-press gesture, which broke select+copy.
 */

import React, { useEffect, useState } from 'react';
import { View, Text, Image, TextStyle, StyleProp } from 'react-native';
import { SvgUri } from 'react-native-svg';

interface Props {
  text: string;
  style?: StyleProp<TextStyle>;
}

// Image-URL pattern: http(s)://... ending in a common image extension.
const IMG_URL_RE = /https?:\/\/[^\s)<"']+\.(?:jpe?g|png|gif|webp|bmp|ico|svg)(?:\?[^\s)<"']*)?/gi;

function extractImageUrls(text: string): string[] {
  const urls = new Set<string>();
  const matches = text.match(IMG_URL_RE);
  if (matches) matches.forEach(u => urls.add(u));
  return Array.from(urls);
}

const SVG_RE = /\.svg(?:\?|$)/i;

/** Image with dynamic aspect ratio taken from the real image dimensions.
 * SVGs are rendered via react-native-svg (no Image.getSize). */
const InlineImage: React.FC<{ uri: string }> = ({ uri }) => {
  const isSvg = SVG_RE.test(uri);
  const [aspectRatio, setAspectRatio] = useState<number>(1);
  const [failed, setFailed] = useState(false);
  useEffect(() => {
    if (isSvg) return; // Image.getSize does not work for SVG
    let cancelled = false;
    Image.getSize(
      uri,
      (w, h) => { if (!cancelled && w > 0 && h > 0) setAspectRatio(Math.max(0.5, Math.min(2.5, w / h))); },
      () => { if (!cancelled) setFailed(true); },
    );
    return () => { cancelled = true; };
  }, [uri, isSvg]);
  if (failed) return null;
  if (isSvg) {
    return (
      <View style={{ marginTop: 8, width: 260, height: 260, backgroundColor: '#0D0D1A', borderRadius: 8, alignItems: 'center', justifyContent: 'center' }}>
        <SvgUri uri={uri} width="100%" height="100%" onError={() => setFailed(true)} />
      </View>
    );
  }
  return (
    <Image
      source={{ uri }}
      style={{ width: 260, aspectRatio, borderRadius: 8, marginTop: 8, backgroundColor: '#0D0D1A' }}
      resizeMode="cover"
      onError={() => setFailed(true)}
    />
  );
};

const MessageText: React.FC<Props> = ({ text, style }) => {
  const imageUrls = extractImageUrls(text || '');
  if (imageUrls.length === 0) {
    return (
      <Text style={style} selectable dataDetectorType="all">
        {text}
      </Text>
    );
  }
  return (
    <View>
      <Text style={style} selectable dataDetectorType="all">
        {text}
      </Text>
      {imageUrls.map(u => <InlineImage key={u} uri={u} />)}
    </View>
  );
};

export default MessageText;
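A usage sketch for MessageText (the message text is made up): given a string mixing prose with image URLs, the component renders the full selectable text once and one inline image per distinct URL below it.

const msg = 'Hier das Logo https://example.com/logo.svg und ein Foto https://example.com/foto.jpg';
// extractImageUrls(msg) → ['https://example.com/logo.svg', 'https://example.com/foto.jpg']
// The .svg goes through <SvgUri>, the .jpg through <Image> with a measured
// aspect ratio clamped to 0.5..2.5:
<MessageText text={msg} style={{ color: '#E0E0F0', fontSize: 15 }} />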
@@ -44,7 +44,6 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({
   const [meterDb, setMeterDb] = useState(-160);
   const pulseAnim = useRef(new Animated.Value(1)).current;
   const durationTimer = useRef<ReturnType<typeof setInterval> | null>(null);
   const isLongPress = useRef(false);

   // Start/stop the pulse animation
   useEffect(() => {
@@ -93,66 +92,62 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({
     }
   }, [isRecording]);

-  // VAD silence callback — auto-stop
+  // VAD silence callback — auto-stop.
+  // IMPORTANT: do NOT check isRecording (the closure is stale) — ask
+  // audioService itself instead. Empty deps → the listener is registered ONCE.
+  // audioService now guarantees the callback fires only once per recording
+  // (silenceFired latch).
+  const onCompleteRef = useRef(onRecordingComplete);
+  useEffect(() => { onCompleteRef.current = onRecordingComplete; }, [onRecordingComplete]);
   useEffect(() => {
     const unsubSilence = audioService.onSilenceDetected(async () => {
-      if (!isRecording) return;
-      setIsRecording(false);
+      if (audioService.getRecordingState() !== 'recording') return;
       const result = await audioService.stopRecording();
+      setIsRecording(false);
       if (result && result.durationMs > 500) {
-        onRecordingComplete(result);
+        onCompleteRef.current(result);
       }
     });
     return unsubSilence;
-  }, [isRecording, onRecordingComplete]);
+  }, []);

   // Auto-start for the wake word (triggered externally)
   const startAutoRecording = useCallback(async () => {
     if (disabled || isRecording) return;
     const started = await audioService.startRecording(true); // autoStop = true
     if (started) {
       isLongPress.current = false;
       setIsRecording(true);
     }
   }, [disabled, isRecording]);

   // Push-to-talk: long press
   const handlePressIn = async () => {
     if (disabled || isRecording) return;
     isLongPress.current = true;
     const started = await audioService.startRecording(false); // no autoStop
     if (started) {
       setIsRecording(true);
     }
   };

   const handlePressOut = async () => {
     if (!isRecording || !isLongPress.current) return;
     isLongPress.current = false;
     setIsRecording(false);
     const result = await audioService.stopRecording();
     if (result && result.durationMs > 300) {
       onRecordingComplete(result);
     }
   };

-  // Tap-to-talk: a single tap starts recording with auto-stop
+  // Tap-to-talk: a single tap starts recording with auto-stop.
+  // Guard against double-taps during the async start/stop.
+  const tapBusy = useRef(false);
   const handleTap = async () => {
-    if (disabled) return;
-    if (isRecording) {
-      // Stop the recording manually
-      setIsRecording(false);
-      const result = await audioService.stopRecording();
-      if (result && result.durationMs > 300) {
-        onRecordingComplete(result);
-      }
-    } else {
-      // Start recording with auto-stop
-      const started = await audioService.startRecording(true);
-      if (started) {
-        isLongPress.current = false;
-        setIsRecording(true);
+    if (disabled || tapBusy.current) return;
+    tapBusy.current = true;
+    try {
+      // Ask the service, not the React state (the closure can be stale)
+      const svcState = audioService.getRecordingState();
+      if (svcState === 'recording') {
+        // Stop the recording manually
+        const result = await audioService.stopRecording();
+        setIsRecording(false);
+        if (result && result.durationMs > 300) {
+          onRecordingComplete(result);
+        }
+      } else if (svcState === 'idle') {
+        // Start recording with auto-stop
+        const started = await audioService.startRecording(true);
+        if (started) {
+          setIsRecording(true);
+        }
+      }
+      // svcState === 'processing': a stop is in progress — do nothing; the
+      // user has to tap again once it is done. We still block briefly via
+      // tapBusy so the user's UI feedback stays in sync.
+    } finally {
+      tapBusy.current = false;
+    }
   };

@@ -183,10 +178,6 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({
           isRecording && styles.buttonOuterRecording,
           { transform: [{ scale: pulseAnim }] },
         ]}
-        onStartShouldSetResponder={() => true}
-        onResponderGrant={handlePressIn}
-        onResponderRelease={handlePressOut}
-        onResponderTerminate={handlePressOut}
       >
         <TouchableOpacity
           activeOpacity={0.8}
@@ -0,0 +1,362 @@
/**
 * VoiceCloneModal — record your own voice and upload it to XTTS.
 *
 * Flow:
 * - The modal shows a read-aloud text (>30s reading time) + a record button
 * - While recording: max 30s, progress bar, countdown
 * - On stop: ask for a name, then send it as voice_upload via RVS
 * - The XTTS bridge stores /voices/<name>.wav and replies with xtts_voice_saved
 */

import React, { useCallback, useEffect, useRef, useState } from 'react';
import {
  Modal,
  View,
  Text,
  TouchableOpacity,
  StyleSheet,
  Alert,
  ScrollView,
  ActivityIndicator,
  TextInput,
} from 'react-native';
import audioService from '../services/audio';
import rvs from '../services/rvs';

interface Props {
  visible: boolean;
  onClose: () => void;
}

const SAMPLE_TEXT = `Das ist meine eigene Stimme fuer ARIA. Ich lese jetzt einen laengeren Absatz laut vor, damit das Voice-Cloning eine gute Grundlage hat. Guten Tag, ich heisse Stefan und baue gerade mit grosser Begeisterung an meinem persoenlichen KI-Assistenten. Wir automatisieren Infrastruktur, managen Sessions und spielen mit Sprachsynthese. Die letzten Jahre habe ich viel gelernt, vor allem dass Geduld genauso wichtig ist wie Neugier. Hoert sich das jetzt an wie ich selbst? Wenn alles klappt, spricht ARIA bald mit dieser Stimme.`;

const MAX_DURATION_MS = 30000;
const TARGET_DURATION_MS = 15000;

const VoiceCloneModal: React.FC<Props> = ({ visible, onClose }) => {
  const [recording, setRecording] = useState(false);
  const [durationMs, setDurationMs] = useState(0);
  const [voiceName, setVoiceName] = useState('');
  const [processing, setProcessing] = useState(false);
  const [recordingPath, setRecordingPath] = useState('');
  const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
  const startTimeRef = useRef<number>(0);
  // The 30s auto-stop below fires from an interval created inside a
  // useCallback with empty deps; calling stopRecording directly there would
  // hit a stale closure (recording would still read false). Keep the latest
  // stopRecording in a ref — same pattern as onCompleteRef in VoiceButton.
  const stopRecordingRef = useRef<() => Promise<void>>(async () => {});

  // Reset state when the modal closes/opens
  useEffect(() => {
    if (!visible) {
      setRecording(false);
      setDurationMs(0);
      setVoiceName('');
      setProcessing(false);
      setRecordingPath('');
      if (timerRef.current) clearInterval(timerRef.current);
    }
  }, [visible]);

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      if (timerRef.current) clearInterval(timerRef.current);
      if (recording) audioService.stopRecording().catch(() => {});
    };
  }, [recording]);

  const startRecording = useCallback(async () => {
    // Fresh recording
    setDurationMs(0);
    setRecordingPath('');
    const ok = await audioService.startRecording(false);
    if (!ok) {
      Alert.alert('Fehler', 'Aufnahme konnte nicht gestartet werden (Mikrofon-Berechtigung?)');
      return;
    }
    setRecording(true);
    startTimeRef.current = Date.now();
    timerRef.current = setInterval(async () => {
      const elapsed = Date.now() - startTimeRef.current;
      setDurationMs(elapsed);
      if (elapsed >= MAX_DURATION_MS) {
        await stopRecordingRef.current();
      }
    }, 100);
  }, []);

  const stopRecording = useCallback(async () => {
    if (timerRef.current) {
      clearInterval(timerRef.current);
      timerRef.current = null;
    }
    if (!recording) return;
    const result = await audioService.stopRecording();
    setRecording(false);
    if (!result) {
      Alert.alert('Keine Sprache erkannt', 'Versuch es bitte nochmal — sprich bis der Timer mindestens 10 Sekunden anzeigt.');
      setDurationMs(0);
      return;
    }
    // The temp file has already been deleted (stopRecording cleans up).
    // For the upload we use the base64 from result directly.
    // result.base64 is already there.
    setRecordingPath(result.base64);
  }, [recording]);
  useEffect(() => { stopRecordingRef.current = stopRecording; }, [stopRecording]);

  const uploadVoice = useCallback(async () => {
    const name = voiceName.trim();
    if (!name) {
      Alert.alert('Name fehlt', 'Bitte gib der Stimme einen Namen (nur Buchstaben, Zahlen, _ und -).');
      return;
    }
    if (!/^[a-zA-Z0-9_-]+$/.test(name)) {
      Alert.alert('Ungueltiger Name', 'Nur Buchstaben, Zahlen, _ und - erlaubt.');
      return;
    }
    if (!recordingPath) {
      Alert.alert('Keine Aufnahme', 'Bitte zuerst aufnehmen.');
      return;
    }
    setProcessing(true);
    try {
      // voice_upload expects samples as an array with base64 (copied from the diagnostic format)
      rvs.send('voice_upload' as any, {
        name,
        samples: [{ base64: recordingPath }],
      });
      Alert.alert('Hochgeladen', `Stimme "${name}" wird vom XTTS-Server verarbeitet. Nach ein paar Sekunden in der Liste verfuegbar.`);
      onClose();
    } catch (err: any) {
      Alert.alert('Fehler', err.message);
    } finally {
      setProcessing(false);
    }
  }, [voiceName, recordingPath, onClose]);

  const progress = Math.min(durationMs / MAX_DURATION_MS, 1);
  const sec = Math.floor(durationMs / 1000);
  const enoughRecorded = durationMs >= TARGET_DURATION_MS;

  return (
    <Modal visible={visible} animationType="slide" onRequestClose={onClose}>
      <View style={styles.container}>
        <View style={styles.header}>
          <Text style={styles.title}>Eigene Stimme aufnehmen</Text>
          <TouchableOpacity onPress={onClose}>
            <Text style={styles.closeX}>{'\u2715'}</Text>
          </TouchableOpacity>
        </View>

        <ScrollView style={styles.content} contentContainerStyle={{padding: 16}}>
          <Text style={styles.hint}>
            Lies den Text laut und deutlich vor. Maximal 30 Sekunden. Je mehr du sprichst
            (Ziel: bis zum Ende des Textes, ca. 20-30s), desto besser wird die geklonte
            Stimme.
          </Text>

          <View style={styles.sampleTextBox}>
            <Text style={styles.sampleText}>{SAMPLE_TEXT}</Text>
          </View>

          {/* Timer + progress */}
          <View style={{marginTop: 20, alignItems: 'center'}}>
            <Text style={[styles.timer, recording && styles.timerActive]}>
              {sec.toString().padStart(2, '0')} / 30 s
            </Text>
            <View style={styles.progressBar}>
              <View style={[styles.progressFill, {width: `${progress * 100}%`, backgroundColor: recording ? '#FF3B30' : '#0096FF'}]} />
            </View>
          </View>

          {/* Record button */}
          {!recordingPath && (
            <TouchableOpacity
              style={[styles.recordBtn, recording && styles.recordBtnActive]}
              onPress={recording ? stopRecording : startRecording}
            >
              <Text style={styles.recordIcon}>{recording ? '\u25A0' : '\u25CF'}</Text>
              <Text style={styles.recordLabel}>{recording ? 'Stop' : 'Aufnahme starten'}</Text>
            </TouchableOpacity>
          )}

          {/* After recording: name + upload */}
          {recordingPath && (
            <View style={{marginTop: 20}}>
              <Text style={styles.hint}>
                Aufnahme ({sec}s) fertig. Vergib einen Namen und lade hoch.
              </Text>
              <TextInput
                style={styles.nameInput}
                value={voiceName}
                onChangeText={setVoiceName}
                placeholder="z.B. stefan"
                placeholderTextColor="#555570"
                autoCapitalize="none"
                autoCorrect={false}
              />
              <View style={{flexDirection: 'row', gap: 8, marginTop: 12}}>
                <TouchableOpacity
                  style={[styles.secondaryBtn, {flex: 1}]}
                  onPress={() => { setRecordingPath(''); setDurationMs(0); }}
                >
                  <Text style={styles.secondaryBtnText}>Nochmal aufnehmen</Text>
                </TouchableOpacity>
                <TouchableOpacity
                  style={[styles.primaryBtn, {flex: 1}]}
                  onPress={uploadVoice}
                  disabled={processing}
                >
                  {processing
                    ? <ActivityIndicator color="#fff" />
                    : <Text style={styles.primaryBtnText}>Hochladen</Text>
                  }
                </TouchableOpacity>
              </View>
            </View>
          )}

          {recording && !enoughRecorded && (
            <Text style={[styles.hint, {marginTop: 12, color: '#FFD60A', textAlign: 'center'}]}>
              Bitte weiter lesen — mindestens 15 Sekunden
            </Text>
          )}

          {recording && enoughRecorded && (
            <Text style={[styles.hint, {marginTop: 12, color: '#34C759', textAlign: 'center'}]}>
              Genug Audio fuer eine gute Klonung. Du kannst stoppen.
            </Text>
          )}
        </ScrollView>
      </View>
    </Modal>
  );
};

const styles = StyleSheet.create({
  container: {
    flex: 1,
    backgroundColor: '#0D0D1A',
  },
  header: {
    flexDirection: 'row',
    alignItems: 'center',
    justifyContent: 'space-between',
    paddingHorizontal: 16,
    paddingTop: 48,
    paddingBottom: 16,
    borderBottomWidth: 1,
    borderBottomColor: '#1E1E2E',
  },
  title: {
    color: '#FFFFFF',
    fontSize: 18,
    fontWeight: '700',
  },
  closeX: {
    color: '#8888AA',
    fontSize: 24,
    paddingHorizontal: 8,
  },
  content: {
    flex: 1,
  },
  hint: {
    color: '#8888AA',
    fontSize: 13,
    lineHeight: 20,
  },
  sampleTextBox: {
    marginTop: 12,
    padding: 14,
    backgroundColor: '#12122A',
    borderRadius: 10,
    borderWidth: 1,
    borderColor: '#1E1E2E',
  },
  sampleText: {
    color: '#E0E0F0',
    fontSize: 15,
    lineHeight: 24,
  },
  timer: {
    color: '#666680',
    fontSize: 42,
    fontWeight: '700',
    fontVariant: ['tabular-nums'],
  },
  timerActive: {
    color: '#FF3B30',
  },
  progressBar: {
    marginTop: 8,
    width: '100%',
    height: 8,
    backgroundColor: '#1E1E2E',
    borderRadius: 4,
    overflow: 'hidden',
  },
  progressFill: {
    height: '100%',
  },
  recordBtn: {
    marginTop: 24,
    flexDirection: 'row',
    alignItems: 'center',
    justifyContent: 'center',
    gap: 12,
    backgroundColor: '#1E1E2E',
    borderRadius: 12,
    padding: 18,
    borderWidth: 2,
    borderColor: '#34C759',
  },
  recordBtnActive: {
    borderColor: '#FF3B30',
    backgroundColor: 'rgba(255,59,48,0.15)',
  },
  recordIcon: {
    color: '#FF3B30',
    fontSize: 24,
    fontWeight: '700',
  },
  recordLabel: {
    color: '#FFFFFF',
    fontSize: 17,
    fontWeight: '600',
  },
  nameInput: {
    marginTop: 10,
    backgroundColor: '#1E1E2E',
    borderRadius: 8,
    paddingHorizontal: 14,
    paddingVertical: 12,
    color: '#FFFFFF',
    fontSize: 15,
    borderWidth: 1,
    borderColor: '#2A2A3E',
  },
  primaryBtn: {
    backgroundColor: '#0096FF',
    borderRadius: 10,
    padding: 14,
    alignItems: 'center',
  },
  primaryBtnText: {
    color: '#FFFFFF',
    fontSize: 15,
    fontWeight: '700',
  },
  secondaryBtn: {
    backgroundColor: '#1E1E2E',
    borderRadius: 10,
    padding: 14,
    alignItems: 'center',
    borderWidth: 1,
    borderColor: '#2A2A3E',
  },
  secondaryBtnText: {
    color: '#8888AA',
    fontSize: 14,
    fontWeight: '600',
  },
});

export default VoiceCloneModal;
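For reference, the message that rvs.send() above presumably puts on the wire; this shape is inferred from the call site and the header comment, not a verified schema of the XTTS bridge:

interface VoiceUploadMessage {
  type: 'voice_upload';
  name: string;                       // validated as [a-zA-Z0-9_-]+ above
  samples: Array<{ base64: string }>; // one recording, base64-encoded
}
// Expected reply per the header comment: a message of type 'xtts_voice_saved'
// once /voices/<name>.wav has been written.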
@@ -0,0 +1,224 @@
/**
 * ZoomableImage — pinch-to-zoom + pan for the fullscreen modal.
 *
 * Pure RN implementation, without react-native-gesture-handler.
 *
 * - 2 fingers: pinch (zoom 1x..5x) + simultaneous pan via the focal point
 * - 1 finger: pan when already zoomed (>1.02x)
 * - Double tap (<300ms between two single taps): toggle 1x ↔ 2.5x
 *
 * Implementation notes on the old version (why it did not work):
 * - `gestureState.numberActiveTouches` is not reliable right after
 *   onPanResponderGrant. We now read the finger count exclusively from
 *   `e.nativeEvent.touches.length`.
 * - When going from 2 → 1 fingers, the pinch reference stayed stuck.
 *   Now: on every finger-count change we re-snapshot the gesture.
 * - Animated.Image now gets pointerEvents="none" so the View is
 *   GUARANTEED to receive the touches.
 * - useNativeDriver is deliberately OFF — otherwise we could not use
 *   setValue() synchronously together with the pan responder.
 */

import React, { useMemo, useRef } from 'react';
import {
  Animated,
  PanResponder,
  GestureResponderEvent,
  ImageStyle,
  StyleProp,
  StyleSheet,
  View,
} from 'react-native';

interface Props {
  uri: string;
  containerWidth: number;
  containerHeight: number;
  style?: StyleProp<ImageStyle>;
}

const MIN_SCALE = 1;
const MAX_SCALE = 5;
const DOUBLE_TAP_MS = 300;
const DOUBLE_TAP_DIST = 30; // max movement for a tap to still count as a tap
const PAN_SLOP_AT_SCALE_1 = 4; // do not treat micro-movement as a pan

const ZoomableImage: React.FC<Props> = ({ uri, containerWidth, containerHeight, style }) => {
  // Animated values for the render transform
  const scale = useRef(new Animated.Value(1)).current;
  const tx = useRef(new Animated.Value(0)).current;
  const ty = useRef(new Animated.Value(0)).current;

  // Logical state — we never read Animated.Value back (that would be async)
  const view = useRef({ scale: 1, x: 0, y: 0 }).current;

  // Gesture snapshot: the state at the start of this gesture phase
  const gesture = useRef({
    fingers: 0, // current finger count
    startScale: 1,
    startX: 0,
    startY: 0,
    startDist: 0, // pinch reference distance
    startFocalX: 0,
    startFocalY: 0,
    movedSinceTouch: 0, // for tap detection
    touchStartedAt: 0,
    touchStartX: 0,
    touchStartY: 0,
  }).current;

  // Double tap
  const lastTap = useRef({ at: 0, x: 0, y: 0 });

  const clamp = (v: number, lo: number, hi: number) => Math.max(lo, Math.min(hi, v));

  const applyClamped = (s: number, x: number, y: number) => {
    const ns = clamp(s, MIN_SCALE, MAX_SCALE);
    // Clamp the translation to the available range
    const maxX = Math.max(0, (containerWidth * ns - containerWidth) / 2);
    const maxY = Math.max(0, (containerHeight * ns - containerHeight) / 2);
    const nx = clamp(x, -maxX, maxX);
    const ny = clamp(y, -maxY, maxY);
    view.scale = ns;
    view.x = nx;
    view.y = ny;
    scale.setValue(ns);
    tx.setValue(nx);
    ty.setValue(ny);
  };

  const distance = (touches: any[]) =>
    Math.hypot(touches[0].pageX - touches[1].pageX, touches[0].pageY - touches[1].pageY);

  const focal = (touches: any[]) => ({
    x: (touches[0].pageX + touches[1].pageX) / 2,
    y: (touches[0].pageY + touches[1].pageY) / 2,
  });

  // Snapshot before every phase change (1↔2 fingers) — prevents jumps
  const snapshot = (touches: any[]) => {
    gesture.startScale = view.scale;
    gesture.startX = view.x;
    gesture.startY = view.y;
    if (touches.length >= 2) {
      gesture.startDist = distance(touches);
      const f = focal(touches);
      gesture.startFocalX = f.x;
      gesture.startFocalY = f.y;
    } else if (touches.length === 1) {
      gesture.startDist = 0;
      gesture.startFocalX = touches[0].pageX;
      gesture.startFocalY = touches[0].pageY;
    }
  };

  const responder = useMemo(
    () =>
      PanResponder.create({
        onStartShouldSetPanResponder: () => true,
        onStartShouldSetPanResponderCapture: () => true,
        onMoveShouldSetPanResponder: () => true,
        onMoveShouldSetPanResponderCapture: () => true,

        onPanResponderGrant: (e: GestureResponderEvent) => {
          const touches = e.nativeEvent.touches as any[];
          gesture.fingers = touches.length;
          gesture.movedSinceTouch = 0;
          gesture.touchStartedAt = Date.now();
          gesture.touchStartX = touches[0]?.pageX ?? 0;
          gesture.touchStartY = touches[0]?.pageY ?? 0;
          snapshot(touches);
        },

        onPanResponderMove: (e: GestureResponderEvent, _gs) => {
          const touches = e.nativeEvent.touches as any[];

          // Phase change? → re-snapshot so nothing jumps
          if (touches.length !== gesture.fingers) {
            gesture.fingers = touches.length;
            snapshot(touches);
            return;
          }

          gesture.movedSinceTouch += 1;

          if (touches.length >= 2) {
            // Pinch + pan via the focal point
            const d = distance(touches);
            if (gesture.startDist === 0) {
              // Safety net in case the snapshot was missed
              snapshot(touches);
              return;
            }
            const factor = d / gesture.startDist;
            const f = focal(touches);
            const newScale = clamp(gesture.startScale * factor, MIN_SCALE, MAX_SCALE);
            // Focal-based pan: zooms around the midpoint of the two fingers
            const newX = gesture.startX + (f.x - gesture.startFocalX);
            const newY = gesture.startY + (f.y - gesture.startFocalY);
            applyClamped(newScale, newX, newY);
          } else if (touches.length === 1 && view.scale > 1.02) {
            const dx = touches[0].pageX - gesture.startFocalX;
            const dy = touches[0].pageY - gesture.startFocalY;
            if (Math.abs(dx) < PAN_SLOP_AT_SCALE_1 && Math.abs(dy) < PAN_SLOP_AT_SCALE_1) return;
            applyClamped(view.scale, gesture.startX + dx, gesture.startY + dy);
          }
        },

        onPanResponderRelease: (e: GestureResponderEvent) => {
          const elapsed = Date.now() - gesture.touchStartedAt;
          const dx = (e.nativeEvent.changedTouches?.[0]?.pageX ?? gesture.touchStartX) - gesture.touchStartX;
          const dy = (e.nativeEvent.changedTouches?.[0]?.pageY ?? gesture.touchStartY) - gesture.touchStartY;
          const wasTap =
            elapsed < 280 &&
            Math.abs(dx) < DOUBLE_TAP_DIST &&
            Math.abs(dy) < DOUBLE_TAP_DIST;
          if (wasTap) {
            const now = Date.now();
            if (now - lastTap.current.at < DOUBLE_TAP_MS) {
              // Double tap → zoom toggle
              if (view.scale > 1.1) {
                applyClamped(1, 0, 0);
              } else {
                applyClamped(2.5, 0, 0);
              }
              lastTap.current = { at: 0, x: 0, y: 0 };
            } else {
              lastTap.current = { at: now, x: gesture.touchStartX, y: gesture.touchStartY };
            }
          }
          gesture.fingers = 0;
          gesture.startDist = 0;
        },

        onPanResponderTerminate: () => {
          gesture.fingers = 0;
          gesture.startDist = 0;
        },
      }),
    [],
  );

  return (
    <View
      style={StyleSheet.absoluteFill}
      collapsable={false}
      {...responder.panHandlers}
    >
      <Animated.View pointerEvents="none" style={StyleSheet.absoluteFill}>
        <Animated.Image
          source={{ uri }}
          style={[
            style,
            {
              transform: [{ translateX: tx }, { translateY: ty }, { scale }],
            },
          ]}
          resizeMode="contain"
        />
      </Animated.View>
    </View>
  );
};

export default ZoomableImage;
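A quick numeric check of the clamp logic in applyClamped (numbers made up): at scale s, the scaled image overhangs the container by w*s - w pixels in total, half per side, which is exactly the pan range the code allows.

const maxPan = (w: number, s: number) => Math.max(0, (w * s - w) / 2);
maxPan(360, 2);   // 180 — ±180px of panning at 2x on a 360px-wide container
maxPan(360, 1);   // 0 — no panning at 1x
maxPan(360, 2.5); // 270 — the double-tap zoom level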
File diff suppressed because it is too large
+1158
-114
File diff suppressed because it is too large
+1024
-15
File diff suppressed because it is too large
@@ -0,0 +1,76 @@
/**
 * Background audio: ARIA's TTS, mic recording and wake-word listening should
 * keep running while the app is minimized. For that we start a foreground
 * service with foregroundServiceType=mediaPlayback|microphone that shows a
 * persistent notification while any audio slot is active.
 *
 * Several components can "hold" the service independently:
 *   - 'tts'  : ARIA is speaking
 *   - 'rec'  : a recording is running
 *   - 'wake' : the wake word is listening passively (ear active)
 *
 * As long as at least one slot is active, the service runs. Once all slots
 * are empty, it is stopped. The notification text adapts to the
 * highest-priority slot (tts > rec > wake).
 */

import { NativeModules } from 'react-native';

interface BackgroundAudioNative {
  start(reason: string): Promise<boolean>;
  stop(): Promise<boolean>;
}

const { BackgroundAudio } = NativeModules as { BackgroundAudio?: BackgroundAudioNative };

type Slot = 'tts' | 'rec' | 'wake';

const slots = new Set<Slot>();

// Priority for the notification text — highest first.
const PRIORITY: Slot[] = ['tts', 'rec', 'wake'];

function topReason(): string {
  for (const s of PRIORITY) {
    if (slots.has(s)) return s;
  }
  return '';
}

async function applyState(): Promise<void> {
  if (!BackgroundAudio) return;
  if (slots.size === 0) {
    try { await BackgroundAudio.stop(); } catch {}
    console.log('[BackgroundAudio] Service gestoppt (keine Slots)');
    return;
  }
  const reason = topReason();
  try {
    await BackgroundAudio.start(reason);
    console.log('[BackgroundAudio] Service aktiv (slot=%s, slots=%s)',
      reason, [...slots].join('+'));
  } catch (err: any) {
    console.warn('[BackgroundAudio] start fehlgeschlagen:', err?.message || err);
  }
}

export async function acquireBackgroundAudio(slot: Slot): Promise<void> {
  if (slots.has(slot)) return;
  slots.add(slot);
  await applyState();
}

export async function releaseBackgroundAudio(slot: Slot): Promise<void> {
  if (!slots.has(slot)) return;
  slots.delete(slot);
  await applyState();
}

export function backgroundAudioActive(): boolean {
  return slots.size > 0;
}

// --- Legacy API (tts slot only) — for call sites that do not know about the
// slot system yet. Maps to the 'tts' slot. ---
export const startBackgroundAudio = () => acquireBackgroundAudio('tts');
export const stopBackgroundAudio = () => releaseBackgroundAudio('tts');
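A minimal sketch of how a caller holds a slot around a unit of work (slot names from the file above; the play callback stands in for the actual playback call):

import { acquireBackgroundAudio, releaseBackgroundAudio } from './backgroundAudio';

async function speakInBackground(play: () => Promise<void>): Promise<void> {
  await acquireBackgroundAudio('tts');   // service starts (or is already running)
  try {
    await play();
  } finally {
    await releaseBackgroundAudio('tts'); // service stops once no slot is held
  }
}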
@@ -0,0 +1,41 @@
/**
 * Verbose-logging toggle: console.log can be muted globally.
 * console.warn/console.error always stay on — you always want to see errors.
 *
 * Default: on (true). Toggled via Settings → Protokoll → Verbose Logging.
 * On startup the stored value is loaded; until then we log normally.
 */

import AsyncStorage from '@react-native-async-storage/async-storage';

export const VERBOSE_LOGGING_KEY = 'aria_verbose_logging';

// Keep the original console.log so we can re-arm the wrapper at any time
// (otherwise toggling back on after off would be dead).
const originalLog = console.log.bind(console);
const noop = () => {};

let _verbose = true;

function applyState(): void {
  console.log = _verbose ? originalLog : noop;
}

/** Load the value from AsyncStorage and apply it. Call on app start. */
export async function initLogger(): Promise<void> {
  try {
    const v = await AsyncStorage.getItem(VERBOSE_LOGGING_KEY);
    _verbose = v !== 'false'; // default: true
  } catch {}
  applyState();
}

export function isVerboseLogging(): boolean {
  return _verbose;
}

export function setVerboseLogging(verbose: boolean): void {
  _verbose = verbose;
  applyState();
  AsyncStorage.setItem(VERBOSE_LOGGING_KEY, String(verbose)).catch(() => {});
}
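Usage is two calls (the import path is assumed relative to the services directory):

import { initLogger, setVerboseLogging, isVerboseLogging } from './logger';

// once during app startup, before anything noisy runs:
initLogger();

// from the settings toggle:
setVerboseLogging(false); // console.log becomes a no-op; warn/error stay live
isVerboseLogging();       // → false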
@@ -0,0 +1,222 @@
/**
 * PhoneCall service — pauses ARIA during phone calls:
 *
 * 1. Classic cellular call via TelephonyManager (PhoneCallModule.kt)
 *    states: idle / ringing / offhook
 *
 * 2. VoIP calls (WhatsApp, Signal, Discord, Telegram, Teams, ...) via the
 *    AudioFocus-loss event (AudioFocusModule.kt). These apps request
 *    AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE when a call comes in — we get a
 *    "loss" event and react exactly as we do to RINGING.
 *
 * In both cases audioService.pauseForCall() + wakeWordService.
 * pauseForCall() are invoked. On call end (idle / focus gain) → resumeFromCall.
 *
 * The READ_PHONE_STATE permission is only needed for path 1 — path 2 needs
 * no extra permission because our own AudioFocus listener fires.
 */

import {
  NativeEventEmitter,
  NativeModules,
  PermissionsAndroid,
  Platform,
  ToastAndroid,
} from 'react-native';
import audioService from './audio';
import wakeWordService from './wakeword';

interface PhoneCallNative {
  start(): Promise<boolean>;
  stop(): Promise<boolean>;
}

const { PhoneCall } = NativeModules as { PhoneCall?: PhoneCallNative };

type PhoneState = 'idle' | 'ringing' | 'offhook';

class PhoneCallService {
  private started: boolean = false;
  private subscription: { remove: () => void } | null = null;
  private focusSubscription: { remove: () => void } | null = null;
  private lastState: PhoneState = 'idle';
  /** So the resume after a VoIP loss does not fire twice if a
   * TelephonyManager IDLE event arrives as well. */
  private interruptedByFocus: boolean = false;

  async start(): Promise<boolean> {
    if (this.started || Platform.OS !== 'android') return false;

    // 1. ALWAYS register the AudioFocus listener — it covers VoIP calls
    //    (WhatsApp, Signal, Discord etc.) and needs no permission.
    try {
      const focusEmitter = new NativeEventEmitter(NativeModules.AudioFocus as any);
      this.focusSubscription = focusEmitter.addListener(
        'AudioFocusChanged',
        (e: { type: 'loss' | 'loss_transient' | 'gain' }) => this._onFocusChanged(e.type),
      );
      console.log('[PhoneCall] AudioFocus-Listener aktiv (fuer VoIP-Calls)');
    } catch (err: any) {
      console.warn('[PhoneCall] AudioFocus-Subscription gescheitert', err?.message || err);
    }

    // 2. TelephonyManager listener — for classic cellular calls
    if (PhoneCall) {
      try {
        const granted = await PermissionsAndroid.request(
          PermissionsAndroid.PERMISSIONS.READ_PHONE_STATE,
          {
            title: 'ARIA Cockpit — Anruf-Erkennung',
            message: 'Damit ARIA bei einem eingehenden Anruf nicht weiterredet, '
              + 'darf die App den Anruf-Status sehen (Klingeln/Aktiv/Aufgelegt). '
              + 'Es werden keine Anrufdaten gelesen oder gespeichert.',
            buttonPositive: 'Erlauben',
            buttonNegative: 'Spaeter',
          },
        );
        if (granted === PermissionsAndroid.RESULTS.GRANTED) {
          const ok = await PhoneCall.start();
          if (ok) {
            const emitter = new NativeEventEmitter(NativeModules.PhoneCall as any);
            this.subscription = emitter.addListener(
              'PhoneCallStateChanged',
              (e: { state: PhoneState }) => this._onStateChanged(e.state),
            );
            console.log('[PhoneCall] TelephonyManager-Listener aktiv');
          }
        } else {
          console.warn('[PhoneCall] READ_PHONE_STATE abgelehnt — VoIP-Calls werden trotzdem ueber AudioFocus erkannt');
        }
      } catch (err: any) {
        console.warn('[PhoneCall] TelephonyManager-Setup gescheitert:', err?.message || err);
      }
    }

    this.started = true;
    return true;
  }

  async stop(): Promise<void> {
    if (!this.started) return;
    try { this.subscription?.remove(); } catch {}
    try { this.focusSubscription?.remove(); } catch {}
    this.subscription = null;
    this.focusSubscription = null;
    if (PhoneCall) {
      try { await PhoneCall.stop(); } catch {}
    }
    this.started = false;
    this.lastState = 'idle';
    this.interruptedByFocus = false;
  }

  private _onStateChanged(state: PhoneState): void {
    if (state === this.lastState) return;
    const prev = this.lastState;
    console.log('[PhoneCall] State: %s → %s', prev, state);
    this.lastState = state;
    if (state === 'ringing' || state === 'offhook') {
      this._haltForCall(state === 'ringing' ? 'Anruf — ARIA pausiert' : 'Im Gespraech — ARIA pausiert');
    } else if (state === 'idle' && prev !== 'idle') {
      // If we already paused due to an AudioFocus loss, do NOT resume twice.
      // The focus-gain event triggers the resume.
      if (!this.interruptedByFocus) {
        this._resumeAfterCall('Anruf beendet — ARIA wieder aktiv');
      }
    }
  }

  /** AudioFocus loss = some other app took over the focus. That happens for
   * VoIP calls (what we want) BUT also for normal audio players (another
   * player starts, a notification sound, even our own Sound calls from the
   * play button). So we check the AudioMode — only IN_CALL (2) or
   * IN_COMMUNICATION (3) counts as a call. */
  private async _onFocusChanged(type: 'loss' | 'loss_transient' | 'gain'): Promise<void> {
    if (type === 'loss' || type === 'loss_transient') {
      // Already paused by the classic TelephonyManager path? Then do not double up.
      if (this.lastState === 'ringing' || this.lastState === 'offhook') return;
      // Check the mode — only handle real calls.
      let mode = -1;
      try { mode = await (NativeModules.AudioFocus as any)?.getMode?.(); } catch {}
      if (mode !== 2 && mode !== 3) {
        // NORMAL mode → no call (Stefan e.g. pressed the play button, or
        // Spotify pushed its way back in). No toasts.
        console.log('[PhoneCall] FOCUS_LOSS ignoriert (AudioMode=%d, kein Call)', mode);
        return;
      }
      this.interruptedByFocus = true;
      this._haltForCall('Anruf erkannt (VoIP) — ARIA pausiert');
      // Poll, because GAIN does not arrive reliably (we release the focus
      // ourselves on halt → no automatic GAIN). AudioMode != IN_COMMUNICATION
      // = the call is over.
      this._startVoipResumePoll();
    } else if (type === 'gain') {
      if (this.interruptedByFocus) {
        this.interruptedByFocus = false;
        this._stopVoipResumePoll();
        this._resumeAfterCall('Audio frei — ARIA wieder aktiv');
      }
    }
  }

  /** Polling fallback: every 3s, check whether AudioMode is NORMAL again. */
  private voipPollTimer: ReturnType<typeof setInterval> | null = null;
  private _startVoipResumePoll(): void {
    if (this.voipPollTimer) return;
    this.voipPollTimer = setInterval(async () => {
      if (!this.interruptedByFocus) {
        this._stopVoipResumePoll();
        return;
      }
      try {
        const mode = await (NativeModules.AudioFocus as any)?.getMode?.();
        // 0 = MODE_NORMAL — the call is over
        if (typeof mode === 'number' && mode === 0) {
          this.interruptedByFocus = false;
          this._stopVoipResumePoll();
          this._resumeAfterCall('Anruf beendet — ARIA wieder aktiv');
        }
      } catch {}
    }, 3000);
  }
  private _stopVoipResumePoll(): void {
    if (this.voipPollTimer) {
      clearInterval(this.voipPollTimer);
      this.voipPollTimer = null;
    }
  }

  private _haltForCall(toast: string): void {
    // Remember the position before we kill the stream — for auto-resume.
    audioService.captureInterruption();
    // pauseForCall (instead of haltAllPlayback): pcmBuffer + messageId survive,
    // further chunks keep being collected so isFinal still writes the WAV.
    audioService.pauseForCall(toast);
    wakeWordService.pauseForCall().catch(() => {});
    ToastAndroid.show(toast, ToastAndroid.SHORT);
  }

  private _resumeAfterCall(toast: string): void {
    // Lift the call pause — new chunks may be played back directly again
    // (in case the bridge has not sent isFinal mid-call yet).
    audioService.endCallPause();
    wakeWordService.resumeFromCall().catch(() => {});
    ToastAndroid.show(toast, ToastAndroid.SHORT);
    // Wait 800ms before the auto-resume — otherwise ARIA's new focus request
    // collides with Spotify's auto-resume after call end. Right after
    // hang-up the system is still transitioning out of IN_CALL mode; Spotify
    // watches for focus gain and would immediately see LOSS again → stays
    // paused. With the delay: Spotify resumes briefly, then ARIA pauses it
    // again properly. If ARIA has nothing pending, Spotify simply stays on.
    setTimeout(() => {
      audioService.resumeFromInterruption(30000).then(ok => {
        if (ok) {
          console.log('[PhoneCall] Auto-Resume von gemerkter Position gestartet');
        }
      }).catch(() => {});
    }, 800);
  }
}

const phoneCallService = new PhoneCallService();
export default phoneCallService;
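The magic numbers 0/2/3 above are android.media.AudioManager modes. Their values are fixed by the Android SDK; naming them makes the checks self-documenting (this TypeScript mapping is a sketch, not an existing export of the modules in this diff):

const MODE_NORMAL = 0;           // no call, no ringtone
const MODE_RINGTONE = 1;         // an incoming call is ringing
const MODE_IN_CALL = 2;          // classic telephony call established
const MODE_IN_COMMUNICATION = 3; // VoIP / audio-video chat call

const isCallMode = (m: number): boolean =>
  m === MODE_IN_CALL || m === MODE_IN_COMMUNICATION;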
@@ -29,6 +29,11 @@ class UpdateService {
  private downloading = false;

  constructor() {
    // On startup, sweep old APK leftovers out of the cache — if this app is
    // running, previously downloaded APKs were either installed already or
    // incomplete. Otherwise this costs 20-30MB on the phone per update.
    this.cleanupOldApks().catch(() => {});

    // Listen for update_available messages
    rvs.onMessage((msg: RVSMessage) => {
      if (msg.type === 'update_available' as any) {
@@ -45,6 +50,71 @@ class UpdateService {
      });
  }

  /** Collects every directory where .apk files could be lying around. */
  private async _apkSearchDirs(): Promise<string[]> {
    const dirs = [RNFS.CachesDirectoryPath, RNFS.DocumentDirectoryPath];
    if ((RNFS as any).ExternalCachesDirectoryPath) {
      dirs.push((RNFS as any).ExternalCachesDirectoryPath);
    }
    if (RNFS.ExternalDirectoryPath) {
      dirs.push(RNFS.ExternalDirectoryPath);
    }
    return dirs;
  }

  /** Cleans old downloaded APK files out of the app directories.
   * Public so Settings can use it for the "Update-Cache leeren" button. */
  async cleanupOldApks(keepCurrentName?: string): Promise<{ removed: number; freedMB: number }> {
    const dirs = await this._apkSearchDirs();
    let removed = 0;
    let freed = 0;
    for (const dir of dirs) {
      try {
        if (!(await RNFS.exists(dir))) continue;
        const files = await RNFS.readDir(dir);
        const apks = files.filter(f => /\.apk$/i.test(f.name));
        for (const f of apks) {
          if (keepCurrentName && f.name === keepCurrentName) continue;
          try {
            const size = parseInt(f.size as any, 10) || 0;
            await RNFS.unlink(f.path);
            removed += 1;
            freed += size;
            console.log(`[Update] APK geloescht: ${f.path} (${(size / 1024 / 1024).toFixed(1)}MB)`);
          } catch (err: any) {
            console.warn(`[Update] APK-Loeschen fehlgeschlagen: ${f.path} (${err?.message || err})`);
          }
        }
      } catch (err: any) {
        console.warn(`[Update] Cleanup-Fehler in ${dir}: ${err?.message || err}`);
      }
    }
    const freedMB = freed / 1024 / 1024;
    if (removed > 0) {
      console.log(`[Update] Cleanup fertig: ${removed} APK${removed === 1 ? '' : 's'} entfernt, ${freedMB.toFixed(1)}MB freigegeben`);
    }
    return { removed, freedMB };
  }

  /** Current total size of all APK files in the app directories (in MB). */
  async getApkCacheSize(): Promise<{ count: number; totalMB: number }> {
    const dirs = await this._apkSearchDirs();
    let count = 0;
    let total = 0;
    for (const dir of dirs) {
      try {
        if (!(await RNFS.exists(dir))) continue;
        const files = await RNFS.readDir(dir);
        for (const f of files) {
          if (!f.isFile() || !/\.apk$/i.test(f.name)) continue;
          count += 1;
          total += parseInt(f.size as any, 10) || 0;
        }
      } catch {}
    }
    return { count, totalMB: total / 1024 / 1024 };
  }

  /** Check for an update on app start */
  checkForUpdate(): void {
    if (this.checking) return;
@@ -111,6 +181,10 @@ class UpdateService {
      });
    });

    // Before writing, sweep old APKs out of the cache — in case several
    // updates are pulled in one session
    await this.cleanupOldApks();

    // Save the base64 as an APK file
    const destPath = `${RNFS.CachesDirectoryPath}/${apkData.fileName}`;
    await RNFS.writeFile(destPath, apkData.base64, 'base64');
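A sketch of the Settings button the cleanup comment mentions (the handler name is made up; it assumes the service singleton is exported as updateService):

const onClearUpdateCache = async (): Promise<void> => {
  const { count } = await updateService.getApkCacheSize();
  if (count === 0) return;
  const { removed, freedMB } = await updateService.cleanupOldApks();
  console.log(`[Settings] ${removed} APKs entfernt, ${freedMB.toFixed(1)}MB freigegeben`);
};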
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Spielt einen kurzen "Bereit"-Sound (Airplane Ding-Dong) wenn das Mikrofon
|
||||
* nach Wake-Word-Erkennung wirklich offen ist. Datei liegt in
|
||||
* android/app/src/main/res/raw/wake_ready_sound.mp3 — wird ueber Android's
|
||||
* Resource-System per react-native-sound abgespielt.
|
||||
*
|
||||
* Toggle: AsyncStorage-Key 'aria_wake_ready_sound_enabled' (default true).
|
||||
*/
|
||||
|
||||
import Sound from 'react-native-sound';
|
||||
import AsyncStorage from '@react-native-async-storage/async-storage';
|
||||
|
||||
export const WAKE_READY_SOUND_STORAGE_KEY = 'aria_wake_ready_sound_enabled';
|
||||
|
||||
Sound.setCategory('Playback', false);
|
||||
|
||||
let cachedSound: Sound | null = null;
|
||||
let cachedFailed = false;
|
||||
|
||||
function getSound(): Promise<Sound | null> {
|
||||
if (cachedFailed) return Promise.resolve(null);
|
||||
if (cachedSound) return Promise.resolve(cachedSound);
|
||||
return new Promise(resolve => {
|
||||
const s = new Sound('wake_ready_sound', Sound.MAIN_BUNDLE, (err) => {
|
||||
if (err) {
|
||||
console.warn('[WakeReadySound] Konnte nicht geladen werden:', err);
|
||||
cachedFailed = true;
|
||||
resolve(null);
|
||||
return;
|
||||
}
|
||||
cachedSound = s;
|
||||
resolve(s);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/** True wenn der User den "Bereit"-Sound aktiviert hat. Default: true. */
|
||||
export async function isWakeReadySoundEnabled(): Promise<boolean> {
|
||||
try {
|
||||
const raw = await AsyncStorage.getItem(WAKE_READY_SOUND_STORAGE_KEY);
|
||||
if (raw === null) return true; // Default an
|
||||
return raw === 'true';
|
||||
} catch {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
export async function setWakeReadySoundEnabled(enabled: boolean): Promise<void> {
|
||||
try {
|
||||
await AsyncStorage.setItem(WAKE_READY_SOUND_STORAGE_KEY, String(enabled));
|
||||
} catch {}
|
||||
}
|
||||
|
||||
/** Spielt den Bereit-Sound einmal ab — non-blocking. Wenn der User ihn
|
||||
* in den Settings deaktiviert hat oder die Datei nicht ladbar ist,
|
||||
* passiert einfach nichts. */
|
||||
export async function playWakeReadySound(): Promise<void> {
|
||||
if (!(await isWakeReadySoundEnabled())) return;
|
||||
const s = await getSound();
|
||||
if (!s) return;
|
||||
try {
|
||||
s.stop(() => {
|
||||
s.setCurrentTime(0);
|
||||
s.play((success) => {
|
||||
if (!success) console.warn('[WakeReadySound] Wiedergabe fehlgeschlagen');
|
||||
});
|
||||
});
|
||||
} catch (e) {
|
||||
console.warn('[WakeReadySound] play() Exception:', e);
|
||||
}
|
||||
}
|
@@ -1,56 +1,372 @@
/**
 * Conversation mode / wake word service
 *
 * Wake-word engine: openWakeWord (https://github.com/dscripka/openWakeWord),
 * fully on-device via ONNX Runtime in native Kotlin (see
 * OpenWakeWordModule.kt + assets/openwakeword/). No API key, no cloud
 * round-trip, not a cent in license fees.
 *
 * Three states:
 *   off        - ear off, nothing running
 *   armed      - ear active, openWakeWord listens passively for the wake word.
 *                The mic is held by OpenWakeWord; the AudioRecorder is off.
 *   conversing - wake word triggered (or ear tapped manually):
 *                active conversation. OpenWakeWord pauses (releases the mic),
 *                the AudioRecorder takes over for recording.
 *                After each ARIA reply the mic opens for X seconds
 *                (conversation window). Silence in the window -> back to armed.
 *
 * If the native module fails (old app version, ONNX init error), 'start'
 * goes straight into 'conversing' (classic direct-recording mode).
 */

import { NativeEventEmitter, NativeModules, ToastAndroid } from 'react-native';
import AsyncStorage from '@react-native-async-storage/async-storage';
import { acquireBackgroundAudio } from './backgroundAudio';

type WakeWordCallback = () => void;
type StateCallback = (state: WakeWordState) => void;

export type WakeWordState = 'off' | 'armed' | 'conversing';

export const WAKE_KEYWORD_STORAGE = 'aria_wake_keyword';

/** Available wake words; they correspond to the .onnx files in
 * android/app/src/main/assets/openwakeword/. Custom keywords (own training
 * via the openwakeword notebook) currently have to be bundled as assets;
 * upload via Diagnostic is phase 2. */
export const WAKE_KEYWORDS = [
  'hey_jarvis',
  'computer',
  'alexa',
  'hey_mycroft',
  'hey_rhasspy',
] as const;
export type WakeKeyword = typeof WAKE_KEYWORDS[number];
export const DEFAULT_KEYWORD: WakeKeyword = 'hey_jarvis';

/** Helper mapping for display in the UI. */
export const KEYWORD_LABELS: Record<WakeKeyword, string> = {
  hey_jarvis: 'Hey Jarvis',
  computer: 'Computer',
  alexa: 'Alexa',
  hey_mycroft: 'Hey Mycroft',
  hey_rhasspy: 'Hey Rhasspy',
};

// Detection tuning; could become configurable in settings later.
const DEFAULT_THRESHOLD = 0.5;
const DEFAULT_PATIENCE = 2;
const DEFAULT_DEBOUNCE_MS = 1500;

interface OpenWakeWordModule {
  init(modelName: string, threshold: number, patience: number, debounceMs: number): Promise<boolean>;
  start(): Promise<boolean>;
  stop(): Promise<boolean>;
  dispose(): Promise<boolean>;
  isAvailable(): Promise<boolean>;
}

const { OpenWakeWord } = NativeModules as { OpenWakeWord?: OpenWakeWordModule };

class WakeWordService {
  private state: WakeWordState = 'off';
  private wakeCallbacks: WakeWordCallback[] = [];
  private stateCallbacks: StateCallback[] = [];
  /** Barge-in callbacks: fire when the wake word is detected WHILE ARIA is
   * speaking. ChatScreen reacts by stopping TTS and starting a new recording. */
  private bargeCallbacks: WakeWordCallback[] = [];
  /** True while wake-word detection runs in parallel with TTS. */
  private bargeListening: boolean = false;
  /** Call pause: the state is remembered so it can be restored after hang-up. */
  private callPaused: boolean = false;
  private preCallState: WakeWordState = 'off';
  /** Cooldown after app resume: a short phase in which wake-word detections
   * are ignored. Switching from background to foreground often produces an
   * audio level spike (AudioFocus switch, AudioTrack re-route) that can
   * falsely trigger openWakeWord. */
  private cooldownUntilMs: number = 0;

  private keyword: WakeKeyword = DEFAULT_KEYWORD;
  private nativeReady: boolean = false;
  private initInProgress: Promise<boolean> | null = null;
  private eventSub: { remove: () => void } | null = null;

  /** Call at app start; loads the settings and builds the native module. */
  async loadFromStorage(): Promise<void> {
    try {
      const w = await AsyncStorage.getItem(WAKE_KEYWORD_STORAGE);
      const wt = (w || DEFAULT_KEYWORD).trim() as WakeKeyword;
      this.keyword = (WAKE_KEYWORDS as readonly string[]).includes(wt) ? wt : DEFAULT_KEYWORD;
      await this.initNative();
    } catch (err) {
      console.warn('[WakeWord] loadFromStorage', err);
    }
  }

  /** Settings change: a different wake word. Re-initializes the native module. */
  async configure(keyword: string): Promise<boolean> {
    const next: WakeKeyword = (WAKE_KEYWORDS as readonly string[]).includes(keyword)
      ? (keyword as WakeKeyword)
      : DEFAULT_KEYWORD;
    this.keyword = next;
    await AsyncStorage.setItem(WAKE_KEYWORD_STORAGE, next);

    // Stop the running instance and re-initialize
    await this.disposeNative();
    const ok = await this.initNative();
    if (!ok) {
      ToastAndroid.show(
        `Wake-Word "${KEYWORD_LABELS[next]}" konnte nicht initialisiert werden — Logs pruefen`,
        ToastAndroid.LONG,
      );
    }
    return ok;
  }

  private async initNative(): Promise<boolean> {
    if (!OpenWakeWord) {
      console.warn('[WakeWord] OpenWakeWord Native-Modul nicht verfuegbar — Direkt-Aufnahme-Fallback aktiv');
      this.nativeReady = false;
      return false;
    }
    if (this.initInProgress) return this.initInProgress;
    this.initInProgress = (async () => {
      try {
        await OpenWakeWord.init(this.keyword, DEFAULT_THRESHOLD, DEFAULT_PATIENCE, DEFAULT_DEBOUNCE_MS);
        // Subscribe only once
        if (!this.eventSub) {
          const emitter = new NativeEventEmitter(NativeModules.OpenWakeWord);
          this.eventSub = emitter.addListener('WakeWordDetected', () => {
            console.log('[WakeWord] Native Detection-Event empfangen');
            this.onWakeDetected().catch(err =>
              console.warn('[WakeWord] onWakeDetected crashed:', err));
          });
        }
        this.nativeReady = true;
        console.log('[WakeWord] Init OK (model=%s)', this.keyword);
        return true;
      } catch (err: any) {
        console.warn('[WakeWord] Init fehlgeschlagen:', err?.message || err);
        this.nativeReady = false;
        return false;
      } finally {
        this.initInProgress = null;
      }
    })();
    return this.initInProgress;
  }

  private async disposeNative(): Promise<void> {
    if (!OpenWakeWord) return;
    try { await OpenWakeWord.dispose(); } catch {}
    this.nativeReady = false;
  }

  /** Ear button pressed; starts passive listening or goes straight to conversation. */
  async start(): Promise<boolean> {
    if (this.state !== 'off') return true;
    // Bring up the foreground service BEFORE touching the mic so that
    // background listening works (Android needs foregroundServiceType=microphone
    // active at the moment of AudioRecord.startRecording).
    await acquireBackgroundAudio('wake');
    if (this.nativeReady && OpenWakeWord) {
      try {
        await OpenWakeWord.start();
        console.log('[WakeWord] armed — warte auf "%s"', this.keyword);
        ToastAndroid.show(`Lausche auf "${KEYWORD_LABELS[this.keyword]}"`, ToastAndroid.SHORT);
        this.setState('armed');
        return true;
      } catch (err: any) {
        console.warn('[WakeWord] start fehlgeschlagen — Fallback Direkt-Aufnahme:',
          err?.message || err);
        ToastAndroid.show(
          `Wake-Word-Start failed: ${err?.message || err}`,
          ToastAndroid.LONG,
        );
      }
    } else {
      console.warn('[WakeWord] Native-Modul nicht bereit — Direkt-Aufnahme-Fallback');
      ToastAndroid.show(
        'Wake-Word nicht aktiv — direkte Aufnahme startet (Mikro hoert mit)',
        ToastAndroid.LONG,
      );
    }
    // Fallback: go straight into conversation
    console.log('[WakeWord] Direkt-Aufnahme startet (kein Wake-Word)');
    this.setState('conversing');
    setTimeout(() => {
      if (this.state === 'conversing') {
        this.wakeCallbacks.forEach(cb => cb());
      }
    }, 500);
    return true;
  }

  /** Switch everything off (ear off) */
  async stop(): Promise<void> {
    console.log('[WakeWord] Ohr deaktiviert');
    if (this.nativeReady && OpenWakeWord) {
      try { await OpenWakeWord.stop(); } catch {}
    }
    this.bargeListening = false;
    this.setState('off');
  }

  /** Set a cooldown: ignore all wake-word detections for the next ms.
   * Called on app resume because AppState changes produce audio spikes
   * that openWakeWord misinterprets as triggers. */
  setResumeCooldown(ms: number = 1500): void {
    this.cooldownUntilMs = Date.now() + ms;
    console.log('[WakeWord] Cooldown aktiv fuer %dms', ms);
  }

  /** Wake word triggered: pause the native module, start the conversation. */
  private async onWakeDetected(): Promise<void> {
    const now = Date.now();
    if (now < this.cooldownUntilMs) {
      const left = this.cooldownUntilMs - now;
      console.log('[WakeWord] Trigger ignoriert (Cooldown noch %dms aktiv — wahrscheinlich App-Resume-Spike)', left);
      return;
    }
    console.log('[WakeWord] Wake-Word "%s" erkannt! (state=%s, barge=%s)',
      this.keyword, this.state, this.bargeListening);
    if (this.nativeReady && OpenWakeWord) {
      try { await OpenWakeWord.stop(); } catch {}
    }
    this.bargeListening = false;
    // If we are already in 'conversing' and the trigger came during ARIA's TTS
    // (barge-in via wake word), we fire a separate callback so that ChatScreen
    // can cancel TTS and start a new recording. Otherwise proceed normally.
    if (this.state === 'conversing') {
      this.bargeCallbacks.forEach(cb => {
        try { cb(); } catch (e) { console.warn('[WakeWord] barge cb err:', e); }
      });
      // No new setState; we stay in 'conversing'.
      return;
    }
    this.setState('conversing');
    setTimeout(() => {
      if (this.state === 'conversing') {
        this.wakeCallbacks.forEach(cb => cb());
      }
    }, 200);
  }

  /** Listen for the wake word IN PARALLEL with TTS playback: the user can say
   * "Computer" while ARIA is still talking; the AcousticEchoCanceler in the
   * native module keeps ARIA's own voice from triggering.
   * Precondition: the AudioRecorder must be free (recording off). If the
   * AudioRecorder is currently running, it has priority and wake-word
   * listening is not possible. */
  async startBargeListening(): Promise<void> {
    if (!this.nativeReady || !OpenWakeWord) return;
    if (this.state !== 'conversing') return;
    if (this.bargeListening) return;
    try {
      await OpenWakeWord.start();
      this.bargeListening = true;
      console.log('[WakeWord] Barge-Listening aktiv (parallel zu TTS)');
    } catch (err) {
      console.warn('[WakeWord] Barge-Listening start fehlgeschlagen:', err);
    }
  }

  /** Turn barge listening off again, e.g. when the AudioRecorder needs the
   * mic for the next recording. */
  async stopBargeListening(): Promise<void> {
    if (!this.bargeListening) return;
    if (this.nativeReady && OpenWakeWord) {
      try { await OpenWakeWord.stop(); } catch {}
    }
    this.bargeListening = false;
    console.log('[WakeWord] Barge-Listening aus');
  }

  /** On an incoming call: stop wake word + recording, remember the pre-call
   * state. The telephony app holds the mic during the call, and on top of
   * that ARIA must not listen in on ongoing phone calls. */
  async pauseForCall(): Promise<void> {
    if (this.callPaused) return;
    this.preCallState = this.state;
    if (this.state === 'off') {
      this.callPaused = true; // remember that we were paused
      return;
    }
    this.callPaused = true;
    if (this.nativeReady && OpenWakeWord) {
      try { await OpenWakeWord.stop(); } catch {}
    }
    this.bargeListening = false;
    console.log('[WakeWord] Anruf — Wake-Word pausiert (war: %s)', this.preCallState);
  }

  /** After hang-up: restore the pre-call state. An active conversation goes
   * back to armed (the user should not land in half a dialog). */
  async resumeFromCall(): Promise<void> {
    if (!this.callPaused) return;
    const restoreTo = this.preCallState;
    this.callPaused = false;
    this.preCallState = 'off';
    console.log('[WakeWord] Anruf zu Ende — restore state=%s', restoreTo);
    if (restoreTo === 'off') return;
    // An active conversation was probably aborted by haltAllPlayback anyway;
    // degrading to armed is safe.
    if (restoreTo === 'conversing') this.setState('armed');
    if (this.nativeReady && OpenWakeWord) {
      try { await OpenWakeWord.start(); } catch (err) {
        console.warn('[WakeWord] Restore-Start fehlgeschlagen:', err);
      }
    }
  }

  /** End the conversation; the user said nothing within the window.
   * With a wake word: back to 'armed' (listener on again).
   * Without: back to 'off'.
   */
  async endConversation(): Promise<void> {
    if (this.state !== 'conversing') return;
    if (this.nativeReady && OpenWakeWord) {
      try {
        await OpenWakeWord.start();
        console.log('[WakeWord] Konversation zu Ende — zurueck zu armed');
        ToastAndroid.show(`Lausche wieder auf "${KEYWORD_LABELS[this.keyword]}"`, ToastAndroid.SHORT);
        this.setState('armed');
        return;
      } catch (err) {
        console.warn('[WakeWord] re-arm fehlgeschlagen:', err);
      }
    }
    console.log('[WakeWord] Konversation zu Ende — Ohr aus');
    ToastAndroid.show('Mikro aus', ToastAndroid.SHORT);
    this.setState('off');
  }

  /** After an ARIA reply (TTS done): start the next recording inside the conversation window */
  async resume(): Promise<void> {
    if (this.state !== 'conversing') return;
    // Short pause so that TTS audio does not leak into the microphone
    await new Promise(resolve => setTimeout(resolve, 800));
    if (this.state === 'conversing') {
      console.log('[WakeWord] TTS fertig — naechste Aufnahme im Conversation-Window');
      this.wakeCallbacks.forEach(cb => cb());
    }
  }

  /** True while the ear is active (armed OR conversing). */
  isActive(): boolean {
    return this.state !== 'off';
  }

  isConversing(): boolean {
    return this.state === 'conversing';
  }

  hasWakeWord(): boolean {
    return this.nativeReady;
  }

  getKeyword(): WakeKeyword {
    return this.keyword;
  }

  // --- Callbacks ---

@@ -62,6 +378,19 @@ class WakeWordService {
    };
  }

  /** Subscribe to barge-in events: the wake word was detected while ARIA is
   * still speaking. ChatScreen should then cancel TTS and start a new recording. */
  onBargeIn(callback: WakeWordCallback): () => void {
    this.bargeCallbacks.push(callback);
    return () => {
      this.bargeCallbacks = this.bargeCallbacks.filter(cb => cb !== callback);
    };
  }

  isBargeListening(): boolean {
    return this.bargeListening;
  }

  onStateChange(callback: StateCallback): () => void {
    this.stateCallbacks.push(callback);
    return () => {

@@ -0,0 +1,35 @@
# ════════════════════════════════════════════════════════════
# ARIA Brain - agent + memory container
#
# FastAPI server with vector-DB memory (Qdrant).
# Talks to Bridge and Diagnostic via HTTP/WebSocket.
# LLM calls go through the proxy (claude-max-api-proxy).
# ════════════════════════════════════════════════════════════

FROM python:3.12-slim

# System tools that skills might need (curl, jq, git, ssh client, build
# basics for venv compiles). Deliberately sparse; anything beyond this is
# brought along by the skill itself (see execution=local-bin).
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    jq \
    git \
    openssh-client \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Embedding model cache and skills live under /data (volume)
ENV SENTENCE_TRANSFORMERS_HOME=/data/_models
ENV ARIA_DATA_DIR=/data

EXPOSE 8080

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
@@ -0,0 +1,385 @@
"""
Conversation loop. One request from Stefan, one reply from ARIA.

Per turn:
1. Append the user turn to the running conversation
2. Fetch hot memory (all pinned points)
3. Fetch cold memory (top-k semantic matches for the user message)
4. Build the system prompt from hot + cold
5. messages = [system, *window, user]
6. Call Claude via the proxy
7. Append the assistant reply to the conversation and return it

Memory distillation runs asynchronously AFTER the reply, driven by the
/chat endpoint via BackgroundTasks.
"""

from __future__ import annotations

import json
import logging
from typing import Optional

from conversation import Conversation, Turn
from memory import Embedder, VectorStore, MemoryPoint
from prompts import build_system_prompt
from proxy_client import ProxyClient, Message as ProxyMessage
import skills as skills_mod

logger = logging.getLogger(__name__)


# Meta tools: ARIA can build new skills herself
META_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "skill_create",
            "description": (
                "Erstelle einen neuen Skill (wiederverwendbare Faehigkeit). "
                "Skills sind IMMER Python — jeder Skill bekommt seine eigene venv "
                "mit den pip_packages die er braucht.\n\n"
                "HARTE REGEL — IMMER Skill anlegen wenn: die Loesung erfordert eine "
                "pip-Library. Sonst muesste der Install bei jedem Container-Restart "
                "neu laufen (Brain hat keinen persistenten State ausser /data/skills/).\n\n"
                "Sonst NUR wenn ALLE Kriterien erfuellt sind:\n"
                " 1) wiederkehrend (Aufgabe kommt realistisch nochmal),\n"
                " 2) nicht-trivial (mehrere Schritte),\n"
                " 3) parametrisierbar (nimmt Eingaben, gibt Ergebnis),\n"
                " 4) wiederverwendbar als ganzes Paket.\n"
                "NICHT fuer einzelne Shell-Befehle (date, hostname, ls etc.) und "
                "nicht fuer Einmal-Faelle. Stefan kann Skill-Erstellung explizit "
                "triggern (\"bau daraus einen Skill\").\n\n"
                "Wenn etwas nur via apt-Paket geht — Stefan fragen ob es ins "
                "Brain-Dockerfile soll, NICHT als Skill bauen."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {"type": "string", "description": "kurz, kebab-case, a-z 0-9 - _"},
                    "description": {"type": "string", "description": "Was kann der Skill? 1 Satz."},
                    "entry_code": {
                        "type": "string",
                        "description": (
                            "Python-Code. Args lesen via os.environ['ARG_NAME']. "
                            "Resultat per print() (stdout) zurueck. Bei Fehler: "
                            "non-zero exit (sys.exit(1) o.ae.)."
                        ),
                    },
                    "readme": {"type": "string", "description": "Markdown — was macht der Skill, Beispiel-Aufrufe"},
                    "pip_packages": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "pip-Pakete die in der venv installiert werden (z.B. requests, yt-dlp, pypdf)",
                    },
                    "args": {
                        "type": "array",
                        "items": {"type": "object"},
                        "description": "Argumente-Schema [{name, type, required, description}]",
                    },
                },
                "required": ["name", "description", "entry_code"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "skill_list",
            "description": "Zeigt alle Skills (inkl. deaktivierte). Sollte selten noetig sein — die Liste steht eh im System-Prompt.",
            "parameters": {"type": "object", "properties": {}},
        },
    },
]
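
# A minimal sketch of what a skill's entry_code could look like under the
# contract described in skill_create above (the ARG_CITY name and the greeting
# logic are hypothetical, not taken from the repo):
#
#     import os
#     import sys
#
#     city = os.environ.get("ARG_CITY", "").strip()
#     if not city:
#         print("missing ARG_CITY", file=sys.stderr)
#         sys.exit(1)                       # non-zero exit marks the run as failed
#     print(f"Hello from the skill, {city}!")  # stdout becomes the tool result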


def _skill_to_tool(s: dict) -> dict:
    """Maps a skill onto an OpenAI function tool."""
    args = s.get("args") or []
    props = {}
    required = []
    for a in args:
        if not isinstance(a, dict):
            continue
        name = a.get("name") or ""
        if not name:
            continue
        props[name] = {
            "type": a.get("type", "string"),
            "description": a.get("description", ""),
        }
        if a.get("required"):
            required.append(name)
    return {
        "type": "function",
        "function": {
            "name": f"run_{s['name']}",
            "description": s.get("description", "(ohne Beschreibung)"),
            "parameters": {
                "type": "object",
                "properties": props,
                "required": required,
            },
        },
    }


class Agent:
    def __init__(self, store: VectorStore, embedder: Embedder,
                 conversation: Conversation, proxy: ProxyClient,
                 cold_k: int = 5):
        self.store = store
        self.embedder = embedder
        self.conversation = conversation
        self.proxy = proxy
        self.cold_k = cold_k
        # Side-channel events created during the turn (e.g. skill_create).
        # The /chat endpoint ships them with the response so that Stefan
        # gets a visible bubble in the app and in Diagnostic.
        self._pending_events: list[dict] = []

    def pop_events(self) -> list[dict]:
        """Fetches the events of the last chat() call and clears the list."""
        events = self._pending_events
        self._pending_events = []
        return events

    # ── Main path: one user turn → tool loop → final reply ──

    MAX_TOOL_ITERATIONS = 8  # guard against endless loops

    def chat(self, user_message: str, source: str = "") -> str:
        user_message = (user_message or "").strip()
        if not user_message:
            raise ValueError("Leere Nachricht")

        # Discard events left over from the previous turn
        self._pending_events = []

        # 1. Append the user turn to the conversation
        self.conversation.add("user", user_message, source=source)

        # 2. Hot memory (all pinned points)
        hot = self.store.list_pinned()

        # 3. Cold memory (top-k semantic)
        try:
            qvec = self.embedder.embed(user_message)
            cold = self.store.search(qvec, k=self.cold_k, exclude_pinned=True)
        except Exception as exc:
            logger.warning("Cold-Search fehlgeschlagen: %s", exc)
            cold = []

        # 4. Fetch the active skills and build the tool list
        all_skills = skills_mod.list_skills(active_only=False)
        active_skills = [s for s in all_skills if s.get("active", True)]
        tools = list(META_TOOLS) + [_skill_to_tool(s) for s in active_skills]

        # 5. System prompt + window messages
        system_prompt = build_system_prompt(hot, cold, skills=all_skills)
        messages = [ProxyMessage(role="system", content=system_prompt)]
        for t in self.conversation.window():
            messages.append(ProxyMessage(role=t.role, content=t.content))

        logger.info("chat: pinned=%d cold=%d skills=%d/%d window=%d prompt_chars=%d",
                    len(hot), len(cold), len(active_skills), len(all_skills),
                    len(self.conversation.window()), len(system_prompt))

        # 6. Tool-use loop
        final_reply = ""
        for iteration in range(self.MAX_TOOL_ITERATIONS):
            result = self.proxy.chat_full(messages, tools=tools)
            if result.tool_calls:
                # Append the assistant turn with tool_calls to messages (not to the conversation!)
                messages.append(ProxyMessage(
                    role="assistant",
                    content=result.content or None,
                    tool_calls=[{
                        "id": tc["id"], "type": "function",
                        "function": {"name": tc["name"], "arguments": json.dumps(tc["arguments"])},
                    } for tc in result.tool_calls],
                ))
                # Execute the tools and feed the results back as role=tool
                for tc in result.tool_calls:
                    tool_result = self._dispatch_tool(tc["name"], tc["arguments"])
                    messages.append(ProxyMessage(
                        role="tool",
                        tool_call_id=tc["id"],
                        name=tc["name"],
                        content=tool_result[:8000],
                    ))
                continue  # next iteration with the tool results
            # No more tool calls → final reply
            final_reply = (result.content or "").strip()
            break
        else:
            # Loop limit reached
            final_reply = "[Tool-Loop-Limit erreicht — ARIA hat zu viele Tool-Calls gemacht ohne fertig zu werden]"
            logger.warning("Tool-Loop hit MAX_TOOL_ITERATIONS=%d", self.MAX_TOOL_ITERATIONS)

        if not final_reply:
            raise RuntimeError("Leerer Reply vom Proxy")

        # 7. Append the assistant turn (final reply) to the conversation
        self.conversation.add("assistant", final_reply)
        return final_reply
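
    # For orientation, the message list that the loop above sends to the proxy
    # grows roughly like this over two iterations (roles only, window turns
    # omitted; purely illustrative):
    #
    #     [system, user]                                # iteration 1 request
    #     [system, user, assistant(tool_calls)]         # model requested a tool
    #     [system, user, assistant(tool_calls), tool]   # iteration 2 request
    #     -> an assistant message without tool_calls is the final reply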

    # ── Tool dispatcher ───────────────────────────────────────

    def _dispatch_tool(self, name: str, arguments: dict) -> str:
        """Executes a tool call and returns a short text result.
        Never raises; errors are reported as text results so that Claude
        can carry on."""
        try:
            if name == "skill_create":
                # ARIA skills are always Python; execution is no longer part of the schema
                manifest = skills_mod.create_skill(
                    name=arguments["name"],
                    description=arguments["description"],
                    execution="local-venv",
                    entry_code=arguments["entry_code"],
                    readme=arguments.get("readme", ""),
                    args=arguments.get("args", []),
                    pip_packages=arguments.get("pip_packages", []),
                    author="aria",
                )
                # Side-channel event: Stefan should see it when ARIA creates something
                self._pending_events.append({
                    "type": "skill_created",
                    "skill": {
                        "name": manifest["name"],
                        "description": manifest.get("description", ""),
                        "execution": manifest.get("execution", ""),
                        "active": manifest.get("active", True),
                        "setup_error": manifest.get("setup_error"),
                    },
                })
                return f"OK — Skill '{manifest['name']}' erstellt (active={manifest['active']})."
            if name == "skill_list":
                items = skills_mod.list_skills(active_only=False)
                if not items:
                    return "(keine Skills vorhanden)"
                return "\n".join(
                    f"- {s['name']} ({s['execution']}) {'aktiv' if s.get('active', True) else 'DEAKTIVIERT'}: {s.get('description', '')}"
                    for s in items
                )
            if name.startswith("run_"):
                skill_name = name[len("run_"):]
                res = skills_mod.run_skill(skill_name, args=arguments)
                snippet = (res.get("stdout") or "")[:2000] or "(kein stdout)"
                err = (res.get("stderr") or "")[:500]
                marker = "OK" if res["ok"] else f"FEHLER (exit={res['exit_code']})"
                out = f"{marker} · {res['duration_sec']}s\nstdout:\n{snippet}"
                if err:
                    out += f"\nstderr:\n{err}"
                return out
            return f"Unbekanntes Tool: {name}"
        except Exception as exc:
            logger.exception("Tool '%s' fehlgeschlagen", name)
            return f"FEHLER: {exc}"

    # ── Memory distillation (runs in the background) ──────────

    def distill_old_turns(self) -> dict:
        """Takes the N oldest turns and distills them into fact memories.

        Pattern: a separate Claude call returns 3-7 JSON facts, which are
        stored as type=fact, source=distilled. After a successful write the
        turns are removed from the window.
        """
        if not self.conversation.needs_distill():
            return {"distilled": 0, "reason": "kein Bedarf"}

        old_turns = self.conversation.take_oldest_for_distill()
        if not old_turns:
            return {"distilled": 0, "reason": "keine alten Turns"}

        # Build the conversation as plain text
        transcript = "\n".join(
            f"[{t.role.upper()}] {t.content}" for t in old_turns
        )[:30000]  # cap at 30k characters so the prompt does not explode

        system = (
            "Du extrahierst aus einer Konversation zwischen Stefan und ARIA die "
            "wichtigsten dauerhaft relevanten Fakten — keine Smalltalk-Details, "
            "keine flüchtigen Zustände. Antworte AUSSCHLIESSLICH mit gültigem JSON "
            "im Format: {\"facts\": [{\"title\": \"kurz, max 80 Zeichen\", "
            "\"content\": \"1-3 Sätze, konkret und nützlich\"}]}. "
            "Mindestens 0, höchstens 7 Facts. Wenn nichts wichtig genug ist: leeres Array."
        )
        user = (
            "Hier ist der Konversations-Abschnitt:\n\n"
            f"{transcript}\n\n"
            "Extrahiere die wichtigsten Fakten als JSON."
        )

        try:
            raw = self.proxy.chat([
                ProxyMessage(role="system", content=system),
                ProxyMessage(role="user", content=user),
            ])
        except Exception as exc:
            logger.warning("Destillat-Call fehlgeschlagen: %s — Turns bleiben", exc)
            return {"distilled": 0, "error": str(exc)}

        facts = self._parse_facts(raw)
        if facts is None:
            logger.warning("Destillat lieferte unparsbares JSON: %r", raw[:200])
            return {"distilled": 0, "error": "JSON parse failed", "raw": raw[:200]}

        # Write the facts to the DB
        created = 0
        for f in facts:
            content = (f.get("content") or "").strip()
            if not content:
                continue
            title = (f.get("title") or "").strip()[:120] or "Fakt"
            point = MemoryPoint(
                id="",
                type="fact",
                title=title,
                content=content,
                pinned=False,
                category="konversation",
                source="distilled",
                tags=[],
            )
            try:
                vec = self.embedder.embed(content)
                self.store.upsert(point, vec)
                created += 1
            except Exception as exc:
                logger.warning("Fakt schreiben fehlgeschlagen: %s", exc)

        # Only remove turns from the window after a successful write
        last_ts = old_turns[-1].ts
        self.conversation.commit_distill(last_ts)
        logger.info("Destillat: %d Facts geschrieben, %d Turns aus Window entfernt",
                    created, len(old_turns))
        return {"distilled": created, "removed_turns": len(old_turns)}

    @staticmethod
    def _parse_facts(raw: str) -> Optional[list]:
        if not raw:
            return None
        # Extract the JSON robustly; Claude may wrap it in code fences
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            # drop the leading ```json or ```
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else cleaned[3:]
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
            cleaned = cleaned.strip()
        # First { to last }
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start == -1 or end == -1 or end < start:
            return None
        try:
            obj = json.loads(cleaned[start:end + 1])
        except Exception:
            return None
        facts = obj.get("facts") if isinstance(obj, dict) else None
        if not isinstance(facts, list):
            return None
        return facts
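
    # For reference, a raw reply that _parse_facts accepts could look like
    # this (values invented for illustration; the shape follows the system
    # prompt in distill_old_turns):
    #
    #     ```json
    #     {"facts": [
    #       {"title": "Lieblings-Editor",
    #        "content": "Stefan arbeitet primaer in VS Code."}
    #     ]}
    #     ```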

@@ -0,0 +1,130 @@
"""
Conversation state: a single rolling-window state for ARIA's running
conversation with Stefan.

Stefan's decision: NO sessions, NO multi-threading. ONE strand, rolling
internally. Whatever falls out gets distilled if needed and lands in the
vector DB as a type=fact memory.

Persistence: append-only JSONL under /data/conversation.jsonl.
On restart the log is re-read and distill markers drop everything that was
already distilled (keeps memory overhead bounded on very long histories).
"""

from __future__ import annotations

import json
import logging
import os
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

logger = logging.getLogger(__name__)

CONVERSATION_FILE = Path(os.environ.get("CONVERSATION_FILE", "/data/conversation.jsonl"))


@dataclass
class Turn:
    role: str  # "user" | "assistant"
    content: str
    ts: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    source: str = ""  # "app" / "diagnostic" / "stt" (optional)


class Conversation:
    """In-memory rolling window with JSONL persistence."""

    def __init__(self, max_window: int = 50, distill_threshold: int = 60,
                 distill_count: int = 30):
        self.max_window = max_window
        self.distill_threshold = distill_threshold
        self.distill_count = distill_count
        self.turns: List[Turn] = []
        self._load()

    def _load(self):
        if not CONVERSATION_FILE.exists():
            return
        try:
            lines = CONVERSATION_FILE.read_text(encoding="utf-8").splitlines()
        except Exception as exc:
            logger.warning("Konversation laden fehlgeschlagen: %s", exc)
            return
        loaded: List[Turn] = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if obj.get("op") == "distill":
                # Marker: everything up to here has been distilled
                drop_until_ts = obj.get("ts", "")
                if drop_until_ts:
                    loaded = [t for t in loaded if t.ts > drop_until_ts]
                continue
            role = obj.get("role")
            content = obj.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                loaded.append(Turn(role=role, content=content,
                                   ts=obj.get("ts", ""),
                                   source=obj.get("source", "")))
        self.turns = loaded
        logger.info("Konversation geladen: %d Turns aus %s", len(self.turns), CONVERSATION_FILE)

    def _append_to_file(self, record: dict):
        try:
            CONVERSATION_FILE.parent.mkdir(parents=True, exist_ok=True)
            with CONVERSATION_FILE.open("a", encoding="utf-8") as f:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
        except Exception as exc:
            logger.warning("Konversation persist fehlgeschlagen: %s", exc)

    def add(self, role: str, content: str, source: str = "") -> Turn:
        t = Turn(role=role, content=content, source=source)
        self.turns.append(t)
        self._append_to_file({
            "ts": t.ts, "role": t.role, "content": t.content, "source": t.source,
        })
        return t

    def window(self) -> List[Turn]:
        """The last max_window turns; these go into the LLM prompt."""
        return self.turns[-self.max_window:]

    def needs_distill(self) -> bool:
        return len(self.turns) > self.distill_threshold

    def take_oldest_for_distill(self) -> List[Turn]:
        """Returns the N oldest turns for the distillation call.
        Does NOT remove them; commit_distill does that after a successful call."""
        return self.turns[:self.distill_count]

    def commit_distill(self, last_distilled_ts: str):
        """Writes a distill marker and removes the turns from the in-memory window."""
        self._append_to_file({"op": "distill", "ts": last_distilled_ts})
        self.turns = [t for t in self.turns if t.ts > last_distilled_ts]
        logger.info("Distill commit bei ts=%s — Window jetzt %d Turns", last_distilled_ts, len(self.turns))

    def reset(self):
        """Hard reset; use with care (Diagnostic button)."""
        try:
            if CONVERSATION_FILE.exists():
                CONVERSATION_FILE.unlink()
        except Exception:
            pass
        self.turns = []
        logger.warning("Konversation komplett zurueckgesetzt")

    def stats(self) -> dict:
        return {
            "turns": len(self.turns),
            "max_window": self.max_window,
            "distill_threshold": self.distill_threshold,
            "needs_distill": self.needs_distill(),
        }
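
A short sketch of how the distill API above is meant to be driven; the stub summarizer stands in for the Claude call in agent.py, everything else is real API from this file:

    conv = Conversation(max_window=50, distill_threshold=60, distill_count=30)

    def fake_distill(turns):
        # stand-in for Agent.distill_old_turns' Claude call (hypothetical)
        return [{"title": t.role, "content": t.content[:80]} for t in turns]

    if conv.needs_distill():
        old = conv.take_oldest_for_distill()   # turns are NOT removed yet
        facts = fake_distill(old)
        if facts is not None:                  # commit only after a successful parse
            conv.commit_distill(old[-1].ts)    # marker persisted, window trimmed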

@@ -0,0 +1,518 @@
"""
ARIA Brain - FastAPI entry point.

- /health              → liveness
- /memory/list         → all points (filtered)
- /memory/pinned       → hot memory
- /memory/search?q=...&k=5 → semantic search
- /memory/save         → create a new point
- /memory/update/{id}  → change a point (re-embed when the content changed)
- /memory/delete/{id}  → delete a point
- /memory/stats        → point count per type

/chat (conversation loop) and /skills/* were added in later phases and
live further down in this file.
"""

from __future__ import annotations

import logging
import os
from datetime import datetime, timezone
from typing import List, Optional

from fastapi import FastAPI, HTTPException, BackgroundTasks, Request
from fastapi.responses import Response
from pydantic import BaseModel, Field

from memory import Embedder, VectorStore, MemoryPoint
from conversation import Conversation
from proxy_client import ProxyClient
from agent import Agent
import skills as skills_mod

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("aria-brain")

QDRANT_HOST = os.environ.get("QDRANT_HOST", "aria-qdrant")
QDRANT_PORT = int(os.environ.get("QDRANT_PORT", "6333"))

app = FastAPI(title="ARIA Brain", version="0.1.0")

_embedder: Optional[Embedder] = None
_store: Optional[VectorStore] = None
_conversation: Optional[Conversation] = None
_proxy: Optional[ProxyClient] = None
_agent: Optional[Agent] = None


def embedder() -> Embedder:
    global _embedder
    if _embedder is None:
        _embedder = Embedder()
    return _embedder


def store() -> VectorStore:
    global _store
    if _store is None:
        _store = VectorStore(host=QDRANT_HOST, port=QDRANT_PORT)
    return _store


def conversation() -> Conversation:
    global _conversation
    if _conversation is None:
        _conversation = Conversation()
    return _conversation


def proxy_client() -> ProxyClient:
    global _proxy
    if _proxy is None:
        _proxy = ProxyClient()
    return _proxy


def agent() -> Agent:
    global _agent
    if _agent is None:
        _agent = Agent(store(), embedder(), conversation(), proxy_client())
    return _agent


# ─── Pydantic schemas ─────────────────────────────────────────────────

class MemoryIn(BaseModel):
    type: str = Field(..., description="identity|rule|preference|tool|skill|fact|conversation|reminder")
    title: str
    content: str
    pinned: bool = False
    category: str = ""
    source: str = "manual"
    tags: List[str] = Field(default_factory=list)
    conversation_id: Optional[str] = None


class MemoryUpdate(BaseModel):
    title: Optional[str] = None
    content: Optional[str] = None
    pinned: Optional[bool] = None
    category: Optional[str] = None
    tags: Optional[List[str]] = None


class MemoryOut(BaseModel):
    id: str
    type: str
    title: str
    content: str
    pinned: bool
    category: str
    source: str
    tags: List[str]
    created_at: str
    updated_at: str
    conversation_id: Optional[str] = None
    score: Optional[float] = None

    @classmethod
    def from_point(cls, p: MemoryPoint) -> "MemoryOut":
        return cls(**p.__dict__)


# ─── Health ───────────────────────────────────────────────────────────

@app.get("/health")
def health():
    try:
        n = store().count()
        return {"status": "ok", "memory_count": n, "qdrant": f"{QDRANT_HOST}:{QDRANT_PORT}"}
    except Exception as exc:
        return {"status": "degraded", "error": str(exc), "qdrant": f"{QDRANT_HOST}:{QDRANT_PORT}"}


# ─── Memory endpoints ─────────────────────────────────────────────────

@app.get("/memory/stats")
def memory_stats():
    s = store()
    points = s.list_all()
    by_type = {}
    pinned = 0
    for p in points:
        by_type[p.type] = by_type.get(p.type, 0) + 1
        if p.pinned:
            pinned += 1
    return {"total": len(points), "pinned": pinned, "by_type": by_type}


@app.get("/memory/list", response_model=List[MemoryOut])
def memory_list(type: Optional[str] = None, limit: int = 200):
    s = store()
    points = s.list_by_type(type, limit=limit) if type else s.list_all(limit=limit)
    return [MemoryOut.from_point(p) for p in points]


@app.get("/memory/pinned", response_model=List[MemoryOut])
def memory_pinned():
    return [MemoryOut.from_point(p) for p in store().list_pinned()]


@app.get("/memory/search", response_model=List[MemoryOut])
def memory_search(q: str, k: int = 5, type: Optional[str] = None, include_pinned: bool = False):
    vec = embedder().embed(q)
    points = store().search(vec, k=k, type_filter=type, exclude_pinned=not include_pinned)
    return [MemoryOut.from_point(p) for p in points]


@app.post("/memory/save", response_model=MemoryOut)
def memory_save(body: MemoryIn):
    s = store()
    vec = embedder().embed(body.content)
    point = MemoryPoint(
        id="",
        type=body.type,
        title=body.title,
        content=body.content,
        pinned=body.pinned,
        category=body.category,
        source=body.source,
        tags=body.tags,
        conversation_id=body.conversation_id,
    )
    pid = s.upsert(point, vec)
    saved = s.get(pid)
    return MemoryOut.from_point(saved)


@app.patch("/memory/update/{point_id}", response_model=MemoryOut)
def memory_update(point_id: str, body: MemoryUpdate):
    s = store()
    existing = s.get(point_id)
    if not existing:
        raise HTTPException(404, f"Memory {point_id} nicht gefunden")

    content_changed = body.content is not None and body.content != existing.content
    if body.title is not None:
        existing.title = body.title
    if body.content is not None:
        existing.content = body.content
    if body.pinned is not None:
        existing.pinned = body.pinned
    if body.category is not None:
        existing.category = body.category
    if body.tags is not None:
        existing.tags = body.tags

    vec = embedder().embed(existing.content) if content_changed else None
    if vec is None:
        # Leave the vector untouched; only rewrite the payload
        from memory.vector_store import COLLECTION
        s.client.set_payload(
            collection_name=COLLECTION,
            payload=existing.to_payload() | {"updated_at": datetime.now(timezone.utc).isoformat()},
            points=[point_id],
        )
        saved = s.get(point_id)
    else:
        s.upsert(existing, vec)
        saved = s.get(point_id)
    return MemoryOut.from_point(saved)


@app.delete("/memory/delete/{point_id}")
def memory_delete(point_id: str):
    s = store()
    if not s.get(point_id):
        raise HTTPException(404, f"Memory {point_id} nicht gefunden")
    s.delete(point_id)
    return {"deleted": point_id}


# ─── Migration from brain-import/ ─────────────────────────────────────

IMPORT_DIR = os.environ.get("IMPORT_DIR", "/import")


@app.post("/memory/migrate")
def memory_migrate():
    """Reads /import/*.md and writes atomic memory points into the DB.
    Idempotent: on a re-run, points with the same migration_key are replaced."""
    from pathlib import Path
    from migration import run_migration
    s = store()
    e = embedder()
    result = run_migration(Path(IMPORT_DIR), s, e)
    return result


@app.get("/memory/import-files")
def memory_import_files():
    """Lists what sits under /import/, for the Diagnostic UI."""
    from pathlib import Path
    d = Path(IMPORT_DIR)
    if not d.exists():
        return {"import_dir": str(d), "exists": False, "files": []}
    out = []
    for p in sorted(d.iterdir()):
        if p.is_file():
            try:
                out.append({"name": p.name, "size": p.stat().st_size})
            except Exception:
                pass
    return {"import_dir": str(d), "exists": True, "files": out}


# ─── Bootstrap snapshot ───────────────────────────────────────────────
# "Bootstrap" = all pinned memories. Export/import for quickly restoring
# a lean ARIA after a wipe.

@app.get("/memory/export-bootstrap")
def memory_export_bootstrap():
    """Returns all pinned memories as JSON, for download in the browser."""
    s = store()
    pinned = s.list_pinned()
    return {
        "version": 1,
        "exported_at": datetime.now(timezone.utc).isoformat(),
        "count": len(pinned),
        "memories": [
            {
                "type": p.type,
                "title": p.title,
                "content": p.content,
                "pinned": True,
                "category": p.category,
                "source": p.source,
                "tags": p.tags,
            }
            for p in pinned
        ],
    }


class BootstrapBundle(BaseModel):
    version: int = 1
    memories: List[dict]


@app.post("/memory/import-bootstrap")
def memory_import_bootstrap(body: BootstrapBundle):
    """Deletes all pinned memories and imports the ones from the bundle.
    Cold memory (unpinned) stays untouched.

    If the bundle contains no memories: delete-only is NOT allowed; the
    caller should export first and then import.
    """
    if not body.memories:
        raise HTTPException(400, "Bundle hat keine memories — Abbruch zur Sicherheit")

    s = store()
    e = embedder()

    # Delete all currently pinned points
    from qdrant_client.http import models as qm
    from memory.vector_store import COLLECTION
    s.client.delete(
        collection_name=COLLECTION,
        points_selector=qm.FilterSelector(filter=qm.Filter(must=[
            qm.FieldCondition(key="pinned", match=qm.MatchValue(value=True))
        ])),
    )

    # Feed in the new points
    created = 0
    for m in body.memories:
        content = (m.get("content") or "").strip()
        if not content:
            continue
        point = MemoryPoint(
            id="",
            type=m.get("type", "fact"),
            title=m.get("title", "(ohne Titel)"),
            content=content,
            pinned=True,
            category=m.get("category", ""),
            source=m.get("source", "bootstrap-import"),
            tags=list(m.get("tags", [])),
        )
        vec = e.embed(content)
        s.upsert(point, vec)
        created += 1

    return {"created": created, "deleted_previous_pinned": True}


# ─── Conversation loop ──────────────────────────────────────────────

class ChatIn(BaseModel):
    message: str
    source: str = ""  # "app" / "diagnostic" / "stt" (optional)


class ChatOut(BaseModel):
    reply: str
    turns: int
    distilling: bool
    events: list = Field(default_factory=list)


@app.post("/chat", response_model=ChatOut)
def chat(body: ChatIn, background: BackgroundTasks):
    """Main path. The reply comes back synchronously. Memory distillation
    runs in the background after the response has gone out."""
    a = agent()
    try:
        reply = a.chat(body.message, source=body.source)
    except ValueError as exc:
        raise HTTPException(400, str(exc))
    except RuntimeError as exc:
        logger.error("chat fehlgeschlagen: %s", exc)
        raise HTTPException(502, str(exc))

    needs_distill = a.conversation.needs_distill()
    if needs_distill:
        background.add_task(a.distill_old_turns)
    return ChatOut(
        reply=reply,
        turns=len(a.conversation.turns),
        distilling=needs_distill,
        events=a.pop_events(),
    )


@app.get("/conversation/stats")
def conversation_stats():
    return conversation().stats()


@app.post("/conversation/reset")
def conversation_reset():
    """Hard reset; the rolling-window history is emptied completely.
    Distilled facts stay in the DB."""
    conversation().reset()
    return {"ok": True, "turns": 0}


@app.post("/conversation/distill")
def conversation_distill_now():
    """Manual trigger for distillation, for tests or before a deliberate reset."""
    return agent().distill_old_turns()


# ─── Skills ─────────────────────────────────────────────────────────

class SkillCreate(BaseModel):
    name: str
    description: str
    execution: str  # local-venv | local-bin | bash
    entry_code: str
    readme: str = ""
    args: list = Field(default_factory=list)
    requires: dict = Field(default_factory=dict)
    pip_packages: list = Field(default_factory=list)
    author: str = "stefan"


class SkillRun(BaseModel):
    name: str
    args: dict = Field(default_factory=dict)
    timeout_sec: int = 300


class SkillPatch(BaseModel):
    description: str | None = None
    active: bool | None = None
    args: list | None = None


@app.get("/skills/list")
def skills_list(active_only: bool = False):
    return {"skills": skills_mod.list_skills(active_only=active_only)}


@app.get("/skills/{name}")
def skills_get(name: str):
    m = skills_mod.read_manifest(name)
    if m is None:
        raise HTTPException(404, f"Skill '{name}' nicht gefunden")
    readme = skills_mod.read_readme(name)
    return {"manifest": m, "readme": readme}


@app.post("/skills/create")
def skills_create(body: SkillCreate):
    try:
        return skills_mod.create_skill(
            name=body.name,
            description=body.description,
            execution=body.execution,
            entry_code=body.entry_code,
            readme=body.readme,
            args=body.args,
            requires=body.requires,
            pip_packages=body.pip_packages,
            author=body.author,
        )
    except ValueError as exc:
        raise HTTPException(400, str(exc))


@app.post("/skills/run")
def skills_run(body: SkillRun):
    try:
        return skills_mod.run_skill(body.name, args=body.args, timeout_sec=body.timeout_sec)
    except ValueError as exc:
        raise HTTPException(400, str(exc))


@app.patch("/skills/{name}")
def skills_patch(name: str, body: SkillPatch):
    patch = {k: v for k, v in body.model_dump().items() if v is not None}
    try:
        return skills_mod.update_skill(name, patch)
    except ValueError as exc:
        raise HTTPException(404, str(exc))


@app.delete("/skills/{name}")
def skills_delete(name: str):
    try:
        skills_mod.delete_skill(name)
    except ValueError as exc:
        raise HTTPException(404, str(exc))
    return {"deleted": name}


@app.get("/skills/{name}/logs")
def skills_logs(name: str, limit: int = 50):
    return {"logs": skills_mod.list_logs(name, limit=limit)}


@app.get("/skills/{name}/export")
def skills_export(name: str):
    try:
        data = skills_mod.export_skill(name)
    except ValueError as exc:
        raise HTTPException(404, str(exc))
    return Response(
        content=data,
        media_type="application/gzip",
        headers={"Content-Disposition": f'attachment; filename="skill-{name}.tar.gz"'},
    )


@app.post("/skills/import")
async def skills_import(request: Request, overwrite: bool = False):
    data = await request.body()
    if not data:
        raise HTTPException(400, "Leerer Body")
    try:
        manifest = skills_mod.import_skill(data, overwrite=overwrite)
    except ValueError as exc:
        raise HTTPException(400, str(exc))
    return {"imported": manifest}

@@ -0,0 +1,4 @@
from .embedder import Embedder
from .vector_store import VectorStore, MemoryPoint, MemoryType

__all__ = ["Embedder", "VectorStore", "MemoryPoint", "MemoryType"]

@@ -0,0 +1,42 @@
"""
Local embedder for memory texts.

Uses sentence-transformers (paraphrase-multilingual-MiniLM-L12-v2):
- German + English
- 384-dimensional vectors
- runs on CPU, roughly 30ms per short text
- the model is cached in /data/_models on first use
"""

from __future__ import annotations

import logging
from typing import List

logger = logging.getLogger(__name__)

MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
VECTOR_DIM = 384


class Embedder:
    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self._model = None

    def _load(self):
        if self._model is None:
            logger.info("Lade Embedding-Modell %s ...", self.model_name)
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)
            logger.info("Embedding-Modell geladen.")

    def embed(self, text: str) -> List[float]:
        self._load()
        vec = self._model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
        return vec.tolist()

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        self._load()
        vecs = self._model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
        return vecs.tolist()
|
||||
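A short usage sketch for the class above; the first call triggers the model download/cache, later calls are pure CPU inference:

```python
from memory import Embedder

emb = Embedder()
vec = emb.embed("Stefan prefers direct answers")
print(len(vec))          # 384 == VECTOR_DIM, vectors are L2-normalized

vecs = emb.embed_batch(["erste Notiz", "second note"])  # German and English both work
print(len(vecs), len(vecs[0]))
```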
@@ -0,0 +1,209 @@
"""
Vector store wrapper around Qdrant.

A single collection "aria_memory" holds ALL memory points.
Separation by type/pinned status is done via payload filters.

Point schema (payload):
    type            — identity | rule | preference | tool | skill | fact | conversation | reminder
    category        — free-form, used for UI grouping
    title           — short heading
    content         — the actual text (this is what gets embedded)
    pinned          — bool, True = hot memory (always in the prompt)
    source          — import | conversation | manual
    tags            — list of strings
    created_at, updated_at — ISO strings
    conversation_id — optional, only for type=conversation
"""

from __future__ import annotations

import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import List, Optional

from qdrant_client import QdrantClient
from qdrant_client.http import models as qm

from .embedder import VECTOR_DIM

logger = logging.getLogger(__name__)

COLLECTION = "aria_memory"


class MemoryType(str, Enum):
    IDENTITY = "identity"
    RULE = "rule"
    PREFERENCE = "preference"
    TOOL = "tool"
    SKILL = "skill"
    FACT = "fact"
    CONVERSATION = "conversation"
    REMINDER = "reminder"


@dataclass
class MemoryPoint:
    id: str
    type: str
    title: str
    content: str
    pinned: bool = False
    category: str = ""
    source: str = "manual"
    tags: List[str] = field(default_factory=list)
    created_at: str = ""
    updated_at: str = ""
    conversation_id: Optional[str] = None
    score: Optional[float] = None  # only set on search results

    def to_payload(self) -> dict:
        p = {
            "type": self.type,
            "title": self.title,
            "content": self.content,
            "pinned": self.pinned,
            "category": self.category,
            "source": self.source,
            "tags": self.tags,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
        }
        if self.conversation_id:
            p["conversation_id"] = self.conversation_id
        return p

    @classmethod
    def from_qdrant(cls, point) -> "MemoryPoint":
        payload = point.payload or {}
        return cls(
            id=str(point.id),
            type=payload.get("type", "fact"),
            title=payload.get("title", ""),
            content=payload.get("content", ""),
            pinned=payload.get("pinned", False),
            category=payload.get("category", ""),
            source=payload.get("source", "manual"),
            tags=payload.get("tags", []),
            created_at=payload.get("created_at", ""),
            updated_at=payload.get("updated_at", ""),
            conversation_id=payload.get("conversation_id"),
            score=getattr(point, "score", None),
        )


def _now() -> str:
    return datetime.now(timezone.utc).isoformat()


class VectorStore:
    def __init__(self, host: str, port: int = 6333):
        self.client = QdrantClient(host=host, port=port)
        self._ensure_collection()

    def _ensure_collection(self):
        existing = [c.name for c in self.client.get_collections().collections]
        if COLLECTION not in existing:
            logger.info("Creating collection %s ...", COLLECTION)
            self.client.create_collection(
                collection_name=COLLECTION,
                vectors_config=qm.VectorParams(size=VECTOR_DIM, distance=qm.Distance.COSINE),
            )
            # Indexes for the typical filter fields
            for field_name in ("type", "pinned", "category", "source", "migration_key"):
                self.client.create_payload_index(
                    collection_name=COLLECTION,
                    field_name=field_name,
                    field_schema=qm.PayloadSchemaType.KEYWORD if field_name != "pinned"
                    else qm.PayloadSchemaType.BOOL,
                )

    # ─── Write operations ────────────────────────────────────────────

    def upsert(self, point: MemoryPoint, vector: List[float]) -> str:
        if not point.id:
            point.id = str(uuid.uuid4())
        if not point.created_at:
            point.created_at = _now()
        point.updated_at = _now()

        self.client.upsert(
            collection_name=COLLECTION,
            points=[qm.PointStruct(id=point.id, vector=vector, payload=point.to_payload())],
        )
        return point.id

    def delete(self, point_id: str):
        self.client.delete(
            collection_name=COLLECTION,
            points_selector=qm.PointIdsList(points=[point_id]),
        )

    # ─── Read operations ─────────────────────────────────────────────

    def get(self, point_id: str) -> Optional[MemoryPoint]:
        result = self.client.retrieve(collection_name=COLLECTION, ids=[point_id], with_payload=True)
        if not result:
            return None
        return MemoryPoint.from_qdrant(result[0])

    def list_pinned(self) -> List[MemoryPoint]:
        """All pinned points — the hot memory."""
        return self._scroll(filter=qm.Filter(must=[
            qm.FieldCondition(key="pinned", match=qm.MatchValue(value=True))
        ]))

    def list_by_type(self, type_: str, limit: int = 100) -> List[MemoryPoint]:
        return self._scroll(
            filter=qm.Filter(must=[
                qm.FieldCondition(key="type", match=qm.MatchValue(value=type_))
            ]),
            limit=limit,
        )

    def list_all(self, limit: int = 1000) -> List[MemoryPoint]:
        return self._scroll(filter=None, limit=limit)

    def _scroll(self, filter, limit: int = 1000) -> List[MemoryPoint]:
        points, _ = self.client.scroll(
            collection_name=COLLECTION,
            scroll_filter=filter,
            limit=limit,
            with_payload=True,
            with_vectors=False,
        )
        return [MemoryPoint.from_qdrant(p) for p in points]

    def search(
        self,
        query_vector: List[float],
        k: int = 5,
        type_filter: Optional[str] = None,
        exclude_pinned: bool = True,
    ) -> List[MemoryPoint]:
        """Semantic search. By default pinned points are excluded
        (those go into the prompt separately via list_pinned)."""
        must = []
        must_not = []
        if type_filter:
            must.append(qm.FieldCondition(key="type", match=qm.MatchValue(value=type_filter)))
        if exclude_pinned:
            must_not.append(qm.FieldCondition(key="pinned", match=qm.MatchValue(value=True)))

        flt = qm.Filter(must=must or None, must_not=must_not or None)

        results = self.client.search(
            collection_name=COLLECTION,
            query_vector=query_vector,
            query_filter=flt if (must or must_not) else None,
            limit=k,
            with_payload=True,
        )
        return [MemoryPoint.from_qdrant(p) for p in results]

    def count(self) -> int:
        return self.client.count(collection_name=COLLECTION, exact=True).count
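How the pieces fit together, as a minimal sketch; the Qdrant hostname is an assumption (whatever the compose service is called):

```python
from memory import Embedder, MemoryPoint, VectorStore

store = VectorStore(host="qdrant")  # assumption: Qdrant service hostname
emb = Embedder()

p = MemoryPoint(id="", type="fact", title="Home", content="ARIA runs on a Proxmox VM")
point_id = store.upsert(p, emb.embed(p.content))

# Pinned points are excluded by default: they enter the prompt via list_pinned()
for hit in store.search(emb.embed("where does ARIA run?"), k=3):
    print(f"{hit.score:.2f}  {hit.title}")
```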
@@ -0,0 +1,399 @@
"""
Migration from aria-data/brain-import/ → vector DB.

Parses the bundled markdown files (AGENT.md, USER.md, TOOLING.md)
and splits them into atomic memory points. Each point gets:

    source = "import"
    migration_key = stable identifier (e.g. "agent.md/rule-1") for idempotency
    pinned = True

On a re-run, existing points with the same migration_key are removed
and written fresh.

Mapping per file:

AGENT.md
    "Identitaet"                     → 1 point  type=identity
    "Persoenlichkeit" (intro)        → 1 point  type=identity
    "Kern-Eigenschaften" (list)      → 1 point per bullet  type=identity
    "Tool-Freigaben"                 → 1 point  type=tool
    "Sicherheitsregeln" (list)       → 1 point per bullet  type=rule
    "Arbeitsprinzipien" (list)       → 1 point per bullet  type=rule
    "Dateien an Stefan zurueckgeben" → 1 point  type=skill
    "Stimme"                         → 1 point  type=tool

USER.md
    "Allgemein" (list)               → 1 point per bullet  type=preference
    "Bestaetigung erforderlich"      → 1 point  type=preference
    "Autonomes Arbeiten OK fuer"     → 1 point  type=preference
    "Tools & Infrastruktur"          → 1 point  type=preference

TOOLING.md
    entire content                   → 1 point  type=tool, title="Tooling-Stack"

BOOTSTRAP.md is a variant of AGENT.md — it goes through the same parser;
since migration_key includes the file name, its points are stored under
their own keys and may overlap with AGENT.md's.
"""

from __future__ import annotations

import logging
import re
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

from memory import Embedder, VectorStore, MemoryPoint
from memory.vector_store import COLLECTION
from qdrant_client.http import models as qm

logger = logging.getLogger(__name__)


@dataclass
class _Block:
    title: str
    content: str


def _split_h2(md: str) -> List[_Block]:
    """Splits markdown into H2 blocks. Content before the first H2 is discarded."""
    blocks: List[_Block] = []
    current: Optional[_Block] = None
    for line in md.splitlines():
        m = re.match(r"^##\s+(.+?)\s*$", line)
        if m and not line.startswith("### "):
            if current:
                blocks.append(current)
            current = _Block(title=m.group(1).strip(), content="")
            continue
        if current is not None:
            current.content += line + "\n"
    if current:
        blocks.append(current)
    return blocks


def _split_h3(content: str) -> List[_Block]:
    """Splits an H2 block into H3 subsections plus the 'header' block before them."""
    blocks: List[_Block] = []
    header_lines: List[str] = []
    current: Optional[_Block] = None
    for line in content.splitlines():
        m = re.match(r"^###\s+(.+?)\s*$", line)
        if m:
            if current is None and header_lines:
                blocks.append(_Block(title="_intro", content="\n".join(header_lines).strip()))
            if current:
                blocks.append(current)
            current = _Block(title=m.group(1).strip(), content="")
            continue
        if current is None:
            header_lines.append(line)
        else:
            current.content += line + "\n"
    if current:
        blocks.append(current)
    elif header_lines:
        blocks.append(_Block(title="_intro", content="\n".join(header_lines).strip()))
    return blocks


def _extract_bullets(content: str) -> List[tuple[str, str]]:
    """Finds "- **Title** — body" or "N. **Title** — body" bullets.

    Returns: list of (title, full_bullet_text).
    """
    bullets: List[tuple[str, str]] = []
    current_lines: List[str] = []
    current_title: Optional[str] = None

    def flush():
        if current_title and current_lines:
            bullets.append((current_title, "\n".join(current_lines).strip()))

    for line in content.splitlines():
        m = re.match(r"^\s*(?:[-*]|\d+\.)\s+\*\*([^*]+?)\*\*\s*[—\-:]?\s*(.*)$", line)
        if m:
            flush()
            current_title = m.group(1).strip()
            current_lines = [line]
            continue
        # Indented follow-up lines belong to the current bullet
        if current_title and (line.startswith(" ") or line.startswith("\t") or not line.strip()):
            current_lines.append(line)
            continue
        if current_title and not re.match(r"^\s*(?:[-*]|\d+\.)\s+", line):
            current_lines.append(line)
            continue
        # New bullet without the **Title** format
        if re.match(r"^\s*(?:[-*]|\d+\.)\s+", line):
            flush()
            text = re.sub(r"^\s*(?:[-*]|\d+\.)\s+", "", line).strip()
            short_title = (text[:60] + "…") if len(text) > 60 else text
            bullets.append((short_title, line.strip()))
            current_title = None
            current_lines = []
    flush()
    return bullets
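To make the bullet grammar concrete, here is what `_extract_bullets` yields for the two supported shapes (a titled bullet with a continuation line, and a plain bullet):

```python
sample = """\
- **Direct** — no beating around the bush
  continuation lines are attached to the bullet above
- plain bullet without a bold title
"""
for title, text in _extract_bullets(sample):
    print(title)
# Direct
# plain bullet without a bold title
```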
# ─── One parser function per file ────────────────────────────────────

def _parse_agent_md(md: str, source_file: str) -> List[MemoryPoint]:
    points: List[MemoryPoint] = []
    h2_blocks = _split_h2(md)
    for h2 in h2_blocks:
        title = h2.title
        content = h2.content.strip()
        if not content:
            continue

        if title.lower() == "identitaet" or title.lower() == "identität":
            points.append(_mk(
                type_="identity", title="ARIA — Identitaet",
                content=f"## {title}\n\n{content}",
                category="persoenlichkeit",
                migration_key=f"{source_file}/identity",
            ))

        elif title.lower() == "persoenlichkeit" or title.lower() == "persönlichkeit":
            # Separate the intro paragraph from the "Kern-Eigenschaften" list
            sub = _split_h3(content)
            for s in sub:
                if s.title == "_intro" and s.content.strip():
                    points.append(_mk(
                        type_="identity", title="Persoenlichkeit — Grundsatz",
                        content=s.content.strip(),
                        category="persoenlichkeit",
                        migration_key=f"{source_file}/personality-intro",
                    ))
                elif s.title.lower().startswith("kern"):
                    for idx, (btitle, btext) in enumerate(_extract_bullets(s.content), 1):
                        points.append(_mk(
                            type_="identity", title=f"Eigenschaft: {btitle}",
                            content=btext, category="persoenlichkeit",
                            migration_key=f"{source_file}/personality-trait-{idx}",
                        ))

        elif "sicherheitsregel" in title.lower():
            for idx, (btitle, btext) in enumerate(_extract_bullets(content), 1):
                points.append(_mk(
                    type_="rule", title=f"Sicherheit: {btitle}",
                    content=btext, category="sicherheit",
                    migration_key=f"{source_file}/security-{idx}",
                ))

        elif "arbeitsprinzipien" in title.lower() or "arbeitsprinzip" in title.lower():
            for idx, (btitle, btext) in enumerate(_extract_bullets(content), 1):
                points.append(_mk(
                    type_="rule", title=f"Prinzip: {btitle}",
                    content=btext, category="arbeitsweise",
                    migration_key=f"{source_file}/work-principle-{idx}",
                ))

        elif "tool-freigaben" in title.lower() or "tool freigaben" in title.lower():
            points.append(_mk(
                type_="tool", title="Tool-Freigaben — Vollzugriff",
                content=content, category="infrastruktur",
                migration_key=f"{source_file}/tool-access",
            ))

        elif "dateien an stefan" in title.lower() or "dateien zurueckgeben" in title.lower() or "dateien zur" in title.lower():
            points.append(_mk(
                type_="skill", title="Dateien an User zurueckgeben",
                content=content, category="ausgabe",
                migration_key=f"{source_file}/file-return-skill",
            ))

        elif title.lower() == "stimme":
            points.append(_mk(
                type_="tool", title="Stimme (F5-TTS)",
                content=content, category="infrastruktur",
                migration_key=f"{source_file}/voice",
            ))

        # Permanent permissions (in BOOTSTRAP) — as rule
        elif "freigaben" in title.lower():
            points.append(_mk(
                type_="rule", title=title,
                content=content, category="freigaben",
                migration_key=f"{source_file}/permissions",
            ))

        else:
            # Unknown block: store as a generic fact, NOT pinned
            logger.info("Unknown H2 block '%s' in %s — stored as fact (unpinned)", title, source_file)
            points.append(_mk(
                type_="fact", title=f"{source_file}: {title}",
                content=content, pinned=False,
                migration_key=f"{source_file}/section-{title.lower().replace(' ', '-')}",
            ))
    return points


def _parse_user_md(md: str, source_file: str) -> List[MemoryPoint]:
    points: List[MemoryPoint] = []
    for h2 in _split_h2(md):
        title = h2.title
        content = h2.content.strip()
        if not content:
            continue
        # Filter out template placeholders: example lines containing a <tag>
        if "<Beispiel-Tool>" in content or "<Username>" in title:
            continue
        if title.lower() == "allgemein":
            for idx, (btitle, btext) in enumerate(_extract_bullets(content), 1):
                # Skip template placeholders
                if "<z.B." in btext or "<XYZ>" in btext:
                    continue
                points.append(_mk(
                    type_="preference", title=f"User: {btitle}",
                    content=btext, category="allgemein",
                    migration_key=f"{source_file}/general-{idx}",
                ))
        else:
            cat_key = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-") or "allgemein"
            points.append(_mk(
                type_="preference", title=title,
                content=content, category=cat_key,
                migration_key=f"{source_file}/{cat_key}",
            ))
    return points


def _parse_tooling_md(md: str, source_file: str) -> List[MemoryPoint]:
    md = md.strip()
    if not md:
        return []
    return [_mk(
        type_="tool", title="Tooling-Stack (VM)",
        content=md, category="infrastruktur",
        migration_key=f"{source_file}/tooling-full",
    )]


# ─── Helpers ─────────────────────────────────────────────────────────

def _mk(
    type_: str,
    title: str,
    content: str,
    migration_key: str,
    pinned: bool = True,
    category: str = "",
) -> MemoryPoint:
    p = MemoryPoint(
        id="",
        type=type_,
        title=title,
        content=content.strip(),
        pinned=pinned,
        category=category,
        source="import",
        tags=[],
    )
    # migration_key is queried via a payload index — it is attached to the
    # payload manually in run_migration (to_payload does not know the field)
    setattr(p, "_migration_key", migration_key)
    return p


# ─── Entry point ─────────────────────────────────────────────────────

def run_migration(
    import_dir: Path,
    store: VectorStore,
    embedder: Embedder,
) -> dict:
    """Reads all .md files from import_dir, parses them, writes to the DB.

    Idempotent: existing points with the same migration_key are deleted
    and written fresh.

    Returns: {"created": int, "files": [...], "import_dir": str}
    (plus "skipped"/"error" keys on early exit).
    """
    if not import_dir.exists():
        return {"created": 0, "updated": 0, "skipped": 0, "files": [], "error": f"{import_dir} not found"}

    parsers = {
        "AGENT.md": _parse_agent_md,
        "BOOTSTRAP.md": _parse_agent_md,  # same parser, entries may overlap
        "USER.md": _parse_user_md,
        "USER.md.example": _parse_user_md,
        "TOOLING.md": _parse_tooling_md,
        "TOOLING.md.example": _parse_tooling_md,
    }

    # USER.md takes precedence over USER.md.example
    file_priority = ["AGENT.md", "BOOTSTRAP.md", "USER.md", "USER.md.example",
                     "TOOLING.md", "TOOLING.md.example"]
    seen_kinds: set[str] = set()  # "USER" / "TOOLING" — only once

    points: List[MemoryPoint] = []
    processed_files: List[str] = []

    for fname in file_priority:
        fp = import_dir / fname
        if not fp.exists():
            continue
        kind = fname.split(".")[0]  # "AGENT", "BOOTSTRAP", "USER", "TOOLING"
        # USER.md.example only if USER.md is missing
        if kind in ("USER", "TOOLING") and kind in seen_kinds:
            continue
        seen_kinds.add(kind)
        parser = parsers.get(fname)
        if not parser:
            continue
        try:
            md = fp.read_text(encoding="utf-8")
            file_points = parser(md, fname)
            points.extend(file_points)
            processed_files.append(f"{fname} ({len(file_points)})")
            logger.info("Migration: %s → %d points", fname, len(file_points))
        except Exception as exc:
            logger.exception("Migration: %s failed", fname)
            processed_files.append(f"{fname} (ERROR: {exc})")

    if not points:
        return {"created": 0, "updated": 0, "skipped": 0, "files": processed_files}

    # First delete old migration points with the same migration_key
    migration_keys = [getattr(p, "_migration_key", None) for p in points]
    migration_keys = [k for k in migration_keys if k]
    if migration_keys:
        store.client.delete(
            collection_name=COLLECTION,
            points_selector=qm.FilterSelector(filter=qm.Filter(must=[
                qm.FieldCondition(key="migration_key", match=qm.MatchAny(any=migration_keys))
            ])),
        )
        logger.info("Migration: removed existing points for %d migration keys", len(migration_keys))

    # Embed in batches
    texts = [p.content for p in points]
    vectors = embedder.embed_batch(texts)

    created = 0
    for p, vec in zip(points, vectors):
        payload = p.to_payload()
        mkey = getattr(p, "_migration_key", None)
        if mkey:
            payload["migration_key"] = mkey
        pid = str(uuid.uuid4())
        now = datetime.now(timezone.utc).isoformat()
        payload["created_at"] = now
        payload["updated_at"] = now
        store.client.upsert(
            collection_name=COLLECTION,
            points=[qm.PointStruct(id=pid, vector=vec, payload=payload)],
        )
        created += 1

    return {
        "created": created,
        "files": processed_files,
        "import_dir": str(import_dir),
    }
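Wiring it up looks roughly like this; the import path and Qdrant hostname are assumptions, the real call site is not part of this diff:

```python
from pathlib import Path

from memory import Embedder, VectorStore

store = VectorStore(host="qdrant")                 # assumption: Qdrant service name
result = run_migration(Path("/data/brain-import"), store, Embedder())
print(result)  # e.g. {"created": 23, "files": ["AGENT.md (14)", ...], "import_dir": "..."}
```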
@@ -0,0 +1,131 @@
"""
System prompt construction from memory points.

Strategy:
1. All pinned points (hot memory) — grouped by type — are written into
   the system prompt. ALWAYS included.
2. The top-k semantically similar points (cold memory) for the current
   user message — attached as "possibly relevant".
3. Active skills as a compact list (name + description only) — so that
   ARIA knows what it has.

Phase B item 1: only the hot-memory build; skills + cold search arrive
with the conversation loop in later phases.
"""

from __future__ import annotations

from typing import List

from memory import MemoryPoint

TYPE_HEADINGS = {
    "identity": "## Who you are",
    "rule": "## Security rules & principles",
    "preference": "## User preferences",
    "tool": "## Tool permissions",
    "skill": "## Your skills",
}


def build_hot_memory_section(pinned: List[MemoryPoint]) -> str:
    """Build the 'always in the prompt' block from pinned points."""
    grouped: dict[str, List[MemoryPoint]] = {}
    for p in pinned:
        grouped.setdefault(p.type, []).append(p)

    parts: List[str] = []
    # Sort order: identity → rule → preference → tool → skill → rest
    order = ["identity", "rule", "preference", "tool", "skill"]
    for t in order:
        items = grouped.pop(t, [])
        if not items:
            continue
        parts.append(TYPE_HEADINGS.get(t, f"## {t}"))
        for p in items:
            parts.append(f"### {p.title}")
            parts.append(p.content.strip())
        parts.append("")

    # Remaining types (in case someone pins something else)
    for t, items in grouped.items():
        parts.append(f"## {t}")
        for p in items:
            parts.append(f"### {p.title}")
            parts.append(p.content.strip())
        parts.append("")

    return "\n".join(parts).strip()


def build_cold_memory_section(matches: List[MemoryPoint]) -> str:
    """Build the 'possibly relevant' block from search hits."""
    if not matches:
        return ""
    lines = ["## Possibly relevant (from memory)"]
    for p in matches:
        score = f" [score={p.score:.2f}]" if p.score is not None else ""
        lines.append(f"- **{p.title}**{score}")
        lines.append(f"  {p.content.strip()}")
    return "\n".join(lines)


def build_skills_section(skills: List[dict]) -> str:
    """Lists all skills (active + disabled) so that ARIA knows what exists
    and does not build duplicates. Plus a clear threshold for when a new
    skill is worth creating."""
    lines = ["## Your skills"]
    if skills:
        for s in skills:
            active = s.get("active", True)
            marker = "" if active else " [DISABLED — cannot be called]"
            lines.append(f"- **{s.get('name', '?')}**{marker} — {s.get('description', '(no description)')}")
        lines.append("")
        lines.append("If an existing skill fits the task: use it via tool call.")
    else:
        lines.append("(no skills yet)")

    lines.append("")
    lines.append("### When is a new skill worth it?")
    lines.append("")
    lines.append("**Skills are ALWAYS Python** — each skill gets its own venv with the "
                 "pip packages it needs. No apt inside a skill, no system-wide install. "
                 "Python usually covers everything (yt-dlp, requests, pypdf, pillow, "
                 "openpyxl, static-ffmpeg, beautifulsoup4, …). If something REALLY only "
                 "works via apt: ask Stefan whether it should go into the brain Dockerfile.")
    lines.append("")
    lines.append("**Hard rule — ALWAYS create a skill when:** the solution requires a "
                 "pip library. Reason: the brain container has no persistent state "
                 "except /data/skills/. Without a skill, the install would be repeated "
                 "on every container restart.")
    lines.append("")
    lines.append("**Otherwise — a skill only if all four apply:**")
    lines.append("")
    lines.append("1. **Recurring** — the task will realistically come up again. "
                 "One-off cases (\"what time is it right now\") are not a skill.")
    lines.append("2. **Non-trivial** — several steps. A single shell command "
                 "(`date`, `hostname`, `ls`) is NOT a skill — Bash does that directly.")
    lines.append("3. **Parameterizable** — the skill takes inputs (URL, file, search term) "
                 "and returns a useful result.")
    lines.append("4. **Reusable as a whole** — Stefan would call it by name in the future "
                 "(\"do the YouTube to MP3 thing\") instead of explaining it every time.")
    lines.append("")
    lines.append("If nothing needs installing AND not all four apply: just solve the "
                 "task without creating a skill. Stefan can always say "
                 "'turn that into a skill'.")
    return "\n".join(lines)


def build_system_prompt(
    pinned: List[MemoryPoint],
    cold: List[MemoryPoint] | None = None,
    skills: List[dict] | None = None,
) -> str:
    """Complete system prompt: hot + cold + skills."""
    parts = [build_hot_memory_section(pinned)]
    if skills:
        parts.append("")
        parts.append(build_skills_section(skills))
    if cold:
        parts.append("")
        parts.append(build_cold_memory_section(cold))
    return "\n".join(parts).strip()
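A quick sketch of what the builder produces, with two hand-made points (contents abbreviated):

```python
from memory import MemoryPoint

pinned = [
    MemoryPoint(id="1", type="identity", title="ARIA identity",
                content="You are ARIA, Stefan's assistant.", pinned=True),
    MemoryPoint(id="2", type="rule", title="Security: push to main",
                content="Never push to main without confirmation.", pinned=True),
]
skills = [{"name": "youtube2mp3", "description": "YouTube URL to MP3", "active": True}]

print(build_system_prompt(pinned, skills=skills))
# ## Who you are
# ### ARIA identity
# ...
```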
@@ -0,0 +1,144 @@
"""
Claude calls through the local proxy.

The proxy (claude-max-api-proxy) exposes an OpenAI-compatible API at
http://proxy:3456/v1/chat/completions. We use non-streaming with a
longer timeout — Claude Code spawns a new CLI process per request
(cold start), and that takes a while.
"""

from __future__ import annotations

import json
import logging
import os
from pathlib import Path
from typing import List, Optional

import httpx
from pydantic import BaseModel

logger = logging.getLogger(__name__)

RUNTIME_CONFIG_FILE = Path("/shared/config/runtime.json")
ENV_MODEL = os.environ.get("BRAIN_MODEL", "claude-sonnet-4")
PROXY_URL = os.environ.get("PROXY_URL", "http://proxy:3456")
PROXY_TIMEOUT_SEC = float(os.environ.get("PROXY_TIMEOUT_SEC", "300"))


def _read_model_from_runtime() -> str:
    """Reads brainModel from runtime.json. Fallback: the BRAIN_MODEL env var."""
    try:
        if RUNTIME_CONFIG_FILE.exists():
            data = json.loads(RUNTIME_CONFIG_FILE.read_text(encoding="utf-8"))
            m = (data.get("brainModel") or "").strip()
            if m:
                return m
    except Exception as exc:
        logger.warning("Failed to read runtime.json: %s", exc)
    return ENV_MODEL


DEFAULT_MODEL = _read_model_from_runtime()


class Message(BaseModel):
    role: str  # "system" | "user" | "assistant" | "tool"
    content: Optional[str] = None
    tool_calls: Optional[list] = None
    tool_call_id: Optional[str] = None
    name: Optional[str] = None  # only for role=tool


class ProxyResult(BaseModel):
    content: str = ""
    tool_calls: list = []  # each: {"id", "name", "arguments" (dict)}
    finish_reason: str = ""


class ProxyClient:
    def __init__(self, base_url: str = PROXY_URL, model: str = DEFAULT_MODEL):
        self.base_url = base_url.rstrip("/")
        self.model = model
        # Persistent client connection — avoids a TCP handshake on every call
        self._client = httpx.Client(timeout=PROXY_TIMEOUT_SEC)

    def chat(self, messages: List[Message], model: Optional[str] = None) -> str:
        """Convenience: simple chat without tools. Returns only the reply string."""
        result = self.chat_full(messages, tools=None, model=model)
        if not result.content:
            raise RuntimeError("Proxy returned empty content")
        return result.content

    def chat_full(
        self,
        messages: List[Message],
        tools: Optional[list] = None,
        model: Optional[str] = None,
    ) -> ProxyResult:
        """Full chat — can return tool calls (if tools were passed).

        The tools format is OpenAI style:
        [{"type":"function","function":{"name":..,"description":..,"parameters":{...}}}, ...]
        """
        url = f"{self.base_url}/v1/chat/completions"
        # Pydantic dumps with exclude_none so that role=tool works without tool_calls
        payload = {
            "model": model or self.model,
            "messages": [m.model_dump(exclude_none=True) for m in messages],
        }
        if tools:
            payload["tools"] = tools
        logger.info("Proxy → %s (%d messages, %d tools, model=%s)",
                    url, len(messages), len(tools or []), payload["model"])
        try:
            r = self._client.post(url, json=payload)
        except httpx.RequestError as exc:
            raise RuntimeError(f"Proxy unreachable: {exc}") from exc
        if r.status_code != 200:
            raise RuntimeError(f"Proxy HTTP {r.status_code}: {r.text[:300]}")
        try:
            data = r.json()
        except Exception as exc:
            raise RuntimeError(f"Proxy invalid JSON: {exc}") from exc

        choices = data.get("choices") or []
        if not choices:
            raise RuntimeError(f"Proxy response without choices: {str(data)[:300]}")

        msg = choices[0].get("message") or {}
        finish_reason = choices[0].get("finish_reason", "")

        content = msg.get("content") or ""
        if isinstance(content, list):
            content = "".join(
                part.get("text", "") for part in content if isinstance(part, dict) and part.get("type") == "text"
            )

        tool_calls_raw = msg.get("tool_calls") or []
        tool_calls = []
        for tc in tool_calls_raw:
            fn = tc.get("function") or {}
            args_raw = fn.get("arguments", "{}")
            args: dict
            if isinstance(args_raw, dict):
                args = args_raw
            else:
                try:
                    args = json.loads(args_raw)
                except Exception:
                    args = {"_raw": args_raw}
            tool_calls.append({
                "id": tc.get("id", ""),
                "name": fn.get("name", ""),
                "arguments": args,
            })

        return ProxyResult(content=content or "", tool_calls=tool_calls, finish_reason=finish_reason)

    def close(self):
        try:
            self._client.close()
        except Exception:
            pass
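Typical use, as a sketch (the proxy must be reachable; the model falls back to BRAIN_MODEL if runtime.json is absent):

```python
client = ProxyClient()
try:
    reply = client.chat([
        Message(role="system", content="You are ARIA."),
        Message(role="user", content="In one sentence: what can you do?"),
    ])
    print(reply)
finally:
    client.close()
```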
@@ -0,0 +1,14 @@
fastapi==0.115.0
uvicorn[standard]==0.32.0
pydantic==2.9.2
httpx==0.27.2
websockets==13.1

# Vector DB
qdrant-client==1.12.1

# Embeddings (runs on CPU, ~120 MB model)
sentence-transformers==3.2.1

# Utility
python-multipart==0.0.12
@@ -0,0 +1,373 @@
"""
Skill manager — filesystem layer for ARIA's capabilities.

Layout:
    /data/skills/<name>/
        skill.json       - manifest
        README.md        - description (style: what, when, how to call)
        run.sh           - entry point (sh, python -m, whatever)
        requirements.txt - optional, for local-venv
        venv/            - created automatically for local-venv
        bin/             - static binaries (for local-bin)
        logs/            - <ts>.json run logs (append-only per run)

Manifest (skill.json):
    {
      "name": "youtube2mp3",
      "description": "Converts a YouTube video URL to MP3",
      "execution": "local-venv" | "local-bin" | "bash",
      "entry": "run.sh",
      "args": [{"name": "url", "required": true}, ...],
      "requires": {"pip": [...], "binaries": [...]},
      "active": true,
      "created_at": "ISO",
      "updated_at": "ISO",
      "last_used": null | "ISO",
      "use_count": 0,
      "version": "1.0",
      "author": "aria" | "stefan"
    }
"""

from __future__ import annotations

import json
import logging
import os
import re
import shutil
import subprocess
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

SKILLS_DIR = Path(os.environ.get("SKILLS_DIR", "/data/skills"))
SHARED_UPLOADS = Path("/shared/uploads")

VALID_EXECUTIONS = {"local-venv", "local-bin", "bash"}
NAME_RE = re.compile(r"^[a-zA-Z0-9_-]{2,60}$")


def _now() -> str:
    return datetime.now(timezone.utc).isoformat()


def _safe_name(name: str) -> str:
    if not isinstance(name, str) or not NAME_RE.match(name):
        raise ValueError(f"Invalid skill name: {name!r}")
    return name


def _skill_dir(name: str) -> Path:
    return SKILLS_DIR / _safe_name(name)
# ─── Listing ────────────────────────────────────────────────────────

def list_skills(active_only: bool = False) -> list[dict]:
    out: list[dict] = []
    if not SKILLS_DIR.exists():
        return out
    for entry in sorted(SKILLS_DIR.iterdir()):
        if not entry.is_dir():
            continue
        manifest = read_manifest(entry.name)
        if manifest is None:
            continue
        if active_only and not manifest.get("active", True):
            continue
        out.append(manifest)
    return out


def read_manifest(name: str) -> Optional[dict]:
    try:
        path = _skill_dir(name) / "skill.json"
        if not path.exists():
            return None
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception as exc:
        logger.warning("Failed to read manifest %s: %s", name, exc)
        return None


def write_manifest(name: str, manifest: dict) -> None:
    d = _skill_dir(name)
    d.mkdir(parents=True, exist_ok=True)
    manifest["updated_at"] = _now()
    (d / "skill.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")


def read_readme(name: str) -> str:
    path = _skill_dir(name) / "README.md"
    return path.read_text(encoding="utf-8") if path.exists() else ""


# ─── Create / Update / Delete ────────────────────────────────────────

def create_skill(
    name: str,
    description: str,
    execution: str,
    entry_code: str,
    readme: str = "",
    args: Optional[list] = None,
    requires: Optional[dict] = None,
    pip_packages: Optional[list[str]] = None,
    author: str = "aria",
) -> dict:
    """Creates a new skill. Raises ValueError on invalid inputs.

    entry_code is written to run.sh or run.py depending on execution.
    For local-venv a venv is created immediately + pip_packages installed.
    """
    name = _safe_name(name)
    if execution not in VALID_EXECUTIONS:
        raise ValueError(f"execution must be one of {VALID_EXECUTIONS}")
    d = _skill_dir(name)
    if d.exists():
        raise ValueError(f"Skill '{name}' already exists — delete or update it first")

    d.mkdir(parents=True)
    (d / "logs").mkdir()

    # Entry file: run.sh or run.py
    if execution == "local-venv":
        entry_path = d / "run.py"
        entry_path.write_text(entry_code, encoding="utf-8")
        entry_name = "run.py"
        (d / "requirements.txt").write_text("\n".join(pip_packages or []) + "\n", encoding="utf-8")
    else:
        entry_path = d / "run.sh"
        # Add a shebang if there is none
        content = entry_code if entry_code.startswith("#!") else "#!/usr/bin/env bash\nset -euo pipefail\n" + entry_code
        entry_path.write_text(content, encoding="utf-8")
        entry_path.chmod(0o755)
        entry_name = "run.sh"

    # README
    (d / "README.md").write_text(readme or f"# {name}\n\n{description}\n", encoding="utf-8")

    manifest = {
        "name": name,
        "description": description,
        "execution": execution,
        "entry": entry_name,
        "args": args or [],
        "requires": requires or {},
        "active": True,
        "created_at": _now(),
        "updated_at": _now(),
        "last_used": None,
        "use_count": 0,
        "version": "1.0",
        "author": author,
    }
    write_manifest(name, manifest)

    # Build the venv for local-venv
    if execution == "local-venv":
        try:
            _setup_venv(d, pip_packages or [])
        except Exception as exc:
            # venv build failed → the skill still exists, but inactive
            manifest["active"] = False
            manifest["setup_error"] = str(exc)[:500]
            write_manifest(name, manifest)
            logger.warning("Skill %s: venv setup failed → disabled: %s", name, exc)

    logger.info("Skill created: %s (%s)", name, execution)
    return manifest
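A hypothetical create_skill call to illustrate the flow; skill name, packages, and entry code are made up, and yt-dlp additionally needs ffmpeg available for audio extraction:

```python
manifest = create_skill(
    name="youtube2mp3",
    description="Convert a YouTube URL to MP3",
    execution="local-venv",
    entry_code=(
        "import os, sys, subprocess\n"
        "url = os.environ['ARG_URL']\n"
        "out = os.path.join(os.environ['SHARED_UPLOADS'], 'aria_audio.mp3')\n"
        "subprocess.run([sys.executable, '-m', 'yt_dlp', '-x',\n"
        "                '--audio-format', 'mp3', '-o', out, url], check=True)\n"
    ),
    args=[{"name": "url", "required": True}],
    pip_packages=["yt-dlp"],
)
print(manifest["active"])  # False + manifest["setup_error"] if the venv build failed
```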
def _setup_venv(skill_dir: Path, pip_packages: list[str]) -> None:
    venv = skill_dir / "venv"
    logger.info("Creating venv: %s", venv)
    subprocess.run(["python", "-m", "venv", str(venv)], check=True, timeout=120)
    pip = venv / "bin" / "pip"
    if pip_packages:
        subprocess.run([str(pip), "install", "--no-cache-dir", *pip_packages], check=True, timeout=600)


def update_skill(name: str, patch: dict) -> dict:
    manifest = read_manifest(name)
    if manifest is None:
        raise ValueError(f"Skill '{name}' not found")
    allowed = {"description", "args", "requires", "active", "version", "entry"}
    for k, v in patch.items():
        if k in allowed:
            manifest[k] = v
    write_manifest(name, manifest)
    return manifest


def delete_skill(name: str) -> None:
    d = _skill_dir(name)
    if not d.exists():
        raise ValueError(f"Skill '{name}' not found")
    shutil.rmtree(d)
    logger.info("Skill deleted: %s", name)


# ─── Run ────────────────────────────────────────────────────────────

def run_skill(name: str, args: Optional[dict] = None, timeout_sec: int = 300) -> dict:
    """Runs a skill. Args are passed as env vars
    (prefix ARG_, e.g. ARG_URL for args["url"]).

    Returns: {ok, exit_code, stdout, stderr, duration_sec, log_path}
    """
    manifest = read_manifest(name)
    if manifest is None:
        raise ValueError(f"Skill '{name}' not found")
    if not manifest.get("active", True):
        raise ValueError(f"Skill '{name}' is disabled")

    d = _skill_dir(name)
    entry = manifest.get("entry", "run.sh")
    exec_mode = manifest.get("execution", "bash")

    env = os.environ.copy()
    # Skill args as env vars
    for k, v in (args or {}).items():
        if not re.match(r"^[a-zA-Z][a-zA-Z0-9_]*$", k):
            continue
        env[f"ARG_{k.upper()}"] = str(v)
    env["SKILL_DIR"] = str(d)
    env["SHARED_UPLOADS"] = str(SHARED_UPLOADS)

    # Build the command
    if exec_mode == "local-venv":
        python = d / "venv" / "bin" / "python"
        cmd = [str(python), str(d / entry)]
    elif exec_mode == "local-bin":
        # The skill brings its own bin/ — we prepend it to PATH
        env["PATH"] = f"{d / 'bin'}:{env.get('PATH', '')}"
        cmd = [str(d / entry)]
    else:  # bash
        cmd = [str(d / entry)]

    log_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
    log_path = d / "logs" / f"{log_id}.json"

    t0 = time.time()
    try:
        proc = subprocess.run(
            cmd, env=env, cwd=str(d),
            capture_output=True, text=True, timeout=timeout_sec,
        )
        out_text = proc.stdout
        err_text = proc.stderr
        exit_code = proc.returncode
        timed_out = False
    except subprocess.TimeoutExpired as exc:
        out_text = exc.stdout or ""
        err_text = (exc.stderr or "") + f"\n[TIMEOUT {timeout_sec}s]"
        exit_code = -1
        timed_out = True
    duration = time.time() - t0

    # Write the log (truncated so it does not blow up)
    record = {
        "ts": _now(),
        "args": args or {},
        "exit_code": exit_code,
        "duration_sec": round(duration, 2),
        "stdout": (out_text or "")[:8000],
        "stderr": (err_text or "")[:8000],
        "timed_out": timed_out,
    }
    try:
        log_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8")
    except Exception:
        pass

    # Update stats
    manifest["last_used"] = _now()
    manifest["use_count"] = int(manifest.get("use_count", 0)) + 1
    write_manifest(name, manifest)

    record["ok"] = exit_code == 0
    record["log_path"] = str(log_path)
    return record
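From the skill's point of view, the contract above means inputs arrive as environment variables. The body of a hypothetical run.py:

```python
# Inside a skill's run.py: everything comes in via the environment
import os

url = os.environ.get("ARG_URL", "")      # set by run_skill(..., args={"url": ...})
skill_dir = os.environ["SKILL_DIR"]      # this skill's own directory
uploads = os.environ["SHARED_UPLOADS"]   # files for Stefan go here

print(f"processing {url}, output goes to {uploads}")
```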
def list_logs(name: str, limit: int = 50) -> list[dict]:
    d = _skill_dir(name) / "logs"
    if not d.exists():
        return []
    files = sorted(d.glob("*.json"), reverse=True)[:limit]
    out: list[dict] = []
    for f in files:
        try:
            data = json.loads(f.read_text(encoding="utf-8"))
            data["log_id"] = f.stem
            out.append(data)
        except Exception:
            continue
    return out


# ─── Export / Import ────────────────────────────────────────────────

def export_skill(name: str) -> bytes:
    """Packs a skill as tar.gz and returns the bytes.
    venv and logs are excluded (they are rebuilt on import)."""
    import io
    import tarfile
    d = _skill_dir(name)
    if not d.exists():
        raise ValueError(f"Skill '{name}' not found")
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
        for path in d.iterdir():
            if path.name in ("venv", "logs", "__pycache__"):
                continue
            tar.add(path, arcname=f"{name}/{path.name}")
    return buf.getvalue()


def import_skill(tar_bytes: bytes, overwrite: bool = False) -> dict:
    """Imports a skill from tar.gz. Returns the manifest."""
    import io
    import tarfile
    SKILLS_DIR.mkdir(parents=True, exist_ok=True)
    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:gz") as tar:
        # First find the root name (= skill name)
        members = tar.getmembers()
        if not members:
            raise ValueError("Empty archive")
        root = members[0].name.split("/", 1)[0]
        name = _safe_name(root)
        d = _skill_dir(name)
        if d.exists():
            if not overwrite:
                raise ValueError(f"Skill '{name}' already exists — set overwrite=true")
            shutil.rmtree(d)
        # Extract — prevent path traversal
        for m in members:
            target = SKILLS_DIR / m.name
            if not str(target.resolve()).startswith(str(SKILLS_DIR.resolve())):
                raise ValueError(f"Unsafe path in archive: {m.name}")
        tar.extractall(SKILLS_DIR)
    # Create the logs directory if it was missing
    (d / "logs").mkdir(exist_ok=True)
    # Rebuild the venv for local-venv
    manifest = read_manifest(name) or {}
    if manifest.get("execution") == "local-venv":
        req_file = d / "requirements.txt"
        pip_packages: list[str] = []
        if req_file.exists():
            pip_packages = [line.strip() for line in req_file.read_text().splitlines()
                            if line.strip() and not line.startswith("#")]
        try:
            _setup_venv(d, pip_packages)
        except Exception as exc:
            logger.warning("Skill import %s: venv setup failed: %s", name, exc)
            manifest["active"] = False
            manifest["setup_error"] = str(exc)[:500]
            write_manifest(name, manifest)
    return manifest
@@ -52,15 +52,61 @@ For web requests: **WebFetch** or **Bash with curl**. Never say "I have
4. **Commit regularly** — with meaningful commit messages.
5. **Keep a daily log** — what was done, what is still open.

## Returning files to Stefan — CRITICAL

**This is the ONLY way Stefan can get files. Without these steps he
neither sees nor receives the file.**

### Rule 1 — Storage location

Save files for Stefan EXCLUSIVELY under `/shared/uploads/`.

NEVER in:
- `/home/node/.openclaw/workspace/...` (that is ONLY your working directory,
  Stefan has no access to it)
- `/tmp/...`, `/root/...`, or anywhere else

File names get an `aria_` prefix so cleanup scripts can attribute them:

```
/shared/uploads/aria_<descriptive_name>.<ext>
```

Examples: `aria_termin_zusage.pdf`, `aria_einkaufsliste.md`,
`aria_logs_2026-05-10.zip`.

### Rule 2 — Marker in the reply text

Set the marker ONCE at the end of your reply:

```
[FILE: /shared/uploads/aria_<name>.<ext>]
```

WITHOUT this marker the file does NOT appear in the app / Diagnostic.

Multiple files: multiple `[FILE: ...]` markers at the end, each on
its own line.

### Example — complete workflow

User: "Write me a lasagna recipe as an md file"

1. You write the file: `Write` tool with path `/shared/uploads/aria_lasagne.md`
2. Reply to Stefan:

```
Here is your lasagna recipe — make the ragu the day before, real
parmesan, do not skip the resting time. Bechamel on every layer.

[FILE: /shared/uploads/aria_lasagne.md]
```

The marker is removed from the visible text automatically and shown
as an attachment bubble. Stefan taps it → opens the file.

## Voice

| Voice | Model | When |
|-------|-------|------|
| **Ramona** (female) | `de_DE-ramona-low` | Everyday use, replies, conversations (default) |
| **Thorsten** (male, deep) | `de_DE-thorsten-high` | Epic moments, alerts, special events |

**Thorsten speaks on:**
- Build successfully deployed
- Ticket resolved / task completed
- Critical alert (server down, security warning)
- When Stefan says "So soll es sein"

TTS runs through F5-TTS (voice cloning, gaming PC). Stefan can clone
his own voices from audio samples (Diagnostic → Voices → Clone voice)
and select them in the app + Diagnostic.
@@ -78,12 +78,101 @@ If a tool does not work, try the alternative. Never say "I have no
- Destructive operations (deleting files, dropping databases)
- Push to main

## Returning files to Stefan — CRITICAL

**This is the ONLY way Stefan can get files. Without these steps he
neither sees nor receives the file.**

### Rule 1 — Storage location

Save files for Stefan EXCLUSIVELY under `/shared/uploads/`.

NEVER in:
- `/home/node/.openclaw/workspace/...` (ONLY your working directory,
  Stefan has no access)
- `/tmp/...`, `/root/...`, or anywhere else

File names with the `aria_` prefix:

```
/shared/uploads/aria_<descriptive_name>.<ext>
```

Examples: `aria_termin_zusage.pdf`, `aria_einkaufsliste.md`,
`aria_logs_2026-05-10.zip`.

### Rule 2 — Marker in the reply text

Set the marker ONCE at the end of your reply:

```
[FILE: /shared/uploads/aria_<name>.<ext>]
```

WITHOUT this marker the file does NOT appear in the app / Diagnostic.

Multiple files: multiple `[FILE: ...]` markers at the end, each on
its own line.

**IMPORTANT — the file MUST exist before you set the marker.**
Markers for non-existent paths are silently filtered + Stefan gets a
hint that you promised a file but did not create it. If, say, you
cannot generate a MIDI file, say so openly instead of just setting
the marker. If in doubt, verify with `Bash` +
`ls -la /shared/uploads/aria_<name>.<ext>` that the file really is
there.

### Example — complete workflow

User: "Write me a lasagna recipe as an md file"

1. You write: `Write` tool with path `/shared/uploads/aria_lasagne.md`
2. Reply to Stefan:

```
Here is your lasagna recipe — make the ragu the day before, real
parmesan, do not skip the resting time. Bechamel on every layer.

[FILE: /shared/uploads/aria_lasagne.md]
```

The marker is removed from the visible text automatically and shown
as an attachment bubble. Stefan taps it → opens the file in its
default program.

### External images/files — ALWAYS download, never just link

If Stefan wants an image or a file from the web (Wikipedia,
Wikimedia Commons, a sample PDF, etc.):

Do NOT just put the URL into the reply — the image is then only
visible for as long as the external server is alive.

INSTEAD:
1. Download it with `Bash` + curl/wget to `/shared/uploads/aria_<name>.<ext>`
2. Deliver it as an attachment with the `[FILE: ...]` marker

Example — user: "Show me a picture of Mickey Mouse"

```bash
curl -sL "https://upload.wikimedia.org/wikipedia/commons/7/7f/Mickey_Mouse.svg" \
  -o /shared/uploads/aria_mickey_mouse.svg
```

Reply:
```
Here is Mickey Mouse — the official SVG from Wikimedia Commons (public domain).

[FILE: /shared/uploads/aria_mickey_mouse.svg]
```

This way the image stays in the chat history permanently, even if the
wiki URL later goes offline or moves.

## Voice

| Voice | Model | When |
|-------|-------|------|
| **Ramona** (female) | `de_DE-ramona-low` | Everyday use, replies, conversations (default) |
| **Thorsten** (male, deep) | `de_DE-thorsten-high` | Epic moments, alerts, special events |

TTS runs through F5-TTS on the Gamebox (voice cloning). Stefan can
clone his own voices from audio samples and select them in the
app/Diagnostic.

## Memory
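The marker contract implies a small post-processing step on the app side. That code is not part of this diff; a minimal sketch of the behavior described above (collect `[FILE: ...]` markers, silently drop the ones whose files were never written):

```python
import os
import re

MARKER_RE = re.compile(r"^\[FILE:\s*(/shared/uploads/\S+)\]\s*$", re.MULTILINE)

def split_reply(text: str) -> tuple[str, list[str]]:
    """Return (visible_text, attachments); markers for missing files are dropped."""
    attachments = [p for p in MARKER_RE.findall(text) if os.path.isfile(p)]
    visible = MARKER_RE.sub("", text).strip()
    return visible, attachments
```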
@@ -147,4 +236,4 @@ Then link the entry in `memory/MEMORY.md` (the index).
### Network
- **aria-net:** Internal Docker network (proxy, aria-core)
- **RVS:** Rendezvous server in the data center — relay for the Android app
- **Bridge:** Voice Bridge (Whisper STT + Piper TTS) — shares its network with aria-core
- **Bridge:** Voice Bridge (orchestrates STT/TTS via the Gamebox bridges) — shares its network with aria-core
@@ -1,10 +1,10 @@
# Stefan — User preferences
# <Username> — User preferences

## General

- **Language:** German
- **Communication:** Direct, no bullshit, humor welcome
- **Role:** Boss, client, developer at HackerSoft Oldenburg
- **Language:** <e.g. German>
- **Communication:** <e.g. direct, no bullshit, humor welcome>
- **Role:** <e.g. boss, client, developer at XYZ>

## Confirmation required for

@@ -12,7 +12,6 @@
- Push to main
- Changes to customer systems
- Server commands that cannot be undone
- Reinstalling Windows (back up data first!)

## Autonomous work OK for

@@ -28,8 +27,10 @@

| Tool | Purpose |
|------|---------|
| **Proxmox** | VM infrastructure (ARIA's home) |
| **Gitea** | Code hosting (gitea.hackersoft.de) |
| **OpenCRM** | Customer management |
| **STARFACE** | Telephony |
| **RustDesk** | Remote IT support at customer sites |
| **<Example-Tool>** | <Purpose> |

<!--
This file is a template. Copy it locally as USER.md and fill it with
your own preferences + tool stack. USER.md itself is excluded from
the repo via .gitignore.
-->
@@ -1,18 +0,0 @@
# Bridge → aria-core (OpenClaw gateway)
# Bridge shares its network with aria-core (network_mode: service:aria)
# → localhost is aria-core
ARIA_CORE_WS=ws://127.0.0.1:18789

# Piper TTS voices
PIPER_RAMONA=/voices/de_DE-ramona-low.onnx
PIPER_THORSTEN=/voices/de_DE-thorsten-high.onnx

# Wake word
WAKE_WORD=aria

# Whisper STT — switched at runtime in the Diagnostic (section "Whisper")
# and stored in /shared/config/voice_config.json. The value here is only the
# initial default on first start.
# Options: tiny | base | small | medium | large-v3
WHISPER_MODEL=medium
WHISPER_LANGUAGE=de
@@ -1,11 +0,0 @@
{
  "version": 1,
  "profiles": {
    "openai-proxy": {
      "provider": "openai",
      "default": true,
      "apiKey": "not-needed",
      "baseUrl": "http://proxy:3456/v1"
    }
  }
}
@@ -1,6 +0,0 @@
# OpenClaw (aria-core) configuration
# This file is mounted into the container as /workspace/.env
#
# IMPORTANT: ANTHROPIC_API_KEY and ANTHROPIC_BASE_URL are deliberately NOT set!
# OpenClaw would otherwise call the real Anthropic API directly (401, since there is no API key).
# Instead, only the OpenAI-compatible proxy is used.
@@ -1,137 +0,0 @@
# OpenClaw Tool Permissions - status 2026-03-15

## The Problem (SOLVED)

ARIA has TWO tool systems at once: Claude Code tools AND OpenClaw-native tools.
The model, however, only has access to **Claude Code tools** (through the proxy), not to the OpenClaw-native tools.

### Root cause: THREE problems at the same time

```
OpenClaw (aria-core) → API request → claude-max-api-proxy (aria-proxy) → Claude Code CLI (--print mode)
                                                                            ↓
                                                      Tools: WebFetch, Bash, etc. (Claude Code)
                                                      NOT: web_fetch, exec (OpenClaw-native)
```

**Problem 1: the proxy uses `--print` mode**
- `claude-max-api-proxy` invokes the Claude Code CLI with `--print --output-format stream-json`
- The prompt is passed as a single string; no tool definitions from OpenClaw come along
- The model sees ONLY Claude Code's built-in tools (WebFetch, Bash, etc.)
- OpenClaw-native tools (web_fetch, exec) exist ONLY at the gateway level and never reach the model

**Problem 2: BOOTSTRAP.md prescribed the wrong tools**
- BOOTSTRAP.md said: "NEVER use WebFetch, use web_fetch instead"
- But web_fetch does not exist in the Claude Code CLI context
- And WebFetch was the only tool that would have worked
- → The model had no tools it was "allowed" to use

**Problem 3: settings.json in the proxy was empty**
- `/root/.claude/settings.json` contained only `{}` (no permissions)
- The Claude Code CLI in headless mode cannot grant tool approvals interactively
- → Even if the model had wanted to use WebFetch, it was not pre-approved

## The Solution

### Fix 1: rewrote BOOTSTRAP.md + AGENT.md

**Before (WRONG):**
- "NEVER use WebFetch - it has permission problems"
- "Use web_fetch (OpenClaw-native)"

**After (CORRECT):**
- "WebFetch - fetch URLs, read web pages, call APIs, check the weather"
- "Bash - run shell commands (curl, ssh, docker, etc.)"
- "Never say 'I have no access' - you have access to everything"

### Fix 2: `CLAUDE_CODE_BUBBLEWRAP=1` + `--dangerously-skip-permissions`

**The key fix.** Two lines in `docker-compose.yml`:

```yaml
# 1. sed patch: insert --dangerously-skip-permissions into manager.js
sed -i 's/"--no-session-persistence",/"--no-session-persistence","--dangerously-skip-permissions",/' $$DIST/subprocess/manager.js &&

# 2. Environment variable: bypass the root check
environment:
  - CLAUDE_CODE_BUBBLEWRAP=1
```

**Why both are needed:**
- `--dangerously-skip-permissions` bypasses all tool permission checks in the Claude Code CLI
- But: the Claude Code CLI blocks this flag when running as root
- `CLAUDE_CODE_BUBBLEWRAP=1` skips the root check (found in the minified `cli.js`)
- The proxy container (`node:22-alpine`) runs as root → without BUBBLEWRAP it does not work

**Resulting CLI arguments:**
```
claude --print --output-format stream-json --verbose --include-partial-messages \
  --model opus --no-session-persistence --dangerously-skip-permissions "prompt"
```

## How the proxy works internally

```
openai-to-cli.js:      OpenAI messages → a single prompt string
                         system    → <system>...</system>
                         user      → passed through verbatim
                         assistant → <previous_response>...</previous_response>

subprocess/manager.js: spawns `claude --print ... --dangerously-skip-permissions "{prompt}"`

cli-to-openai.js:      Claude CLI JSON stream → OpenAI chat completion chunks
```

The proxy forwards NO tool definitions from OpenClaw.
Tool calls happen INTERNALLY in the Claude Code CLI and are transparent to OpenClaw.

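A minimal Python sketch of the first adapter step described above, i.e. flattening an OpenAI-style message list into one prompt string (the real adapter is JavaScript; the function name here is illustrative):

```python
def messages_to_prompt(messages: list[dict]) -> str:
    """Flatten OpenAI-style messages into one prompt string,
    mirroring the wrapping scheme described above (illustrative)."""
    parts = []
    for msg in messages:
        content = msg.get("content") or ""
        if msg["role"] == "system":
            parts.append(f"<system>{content}</system>")
        elif msg["role"] == "assistant":
            parts.append(f"<previous_response>{content}</previous_response>")
        else:  # user messages are passed through verbatim
            parts.append(content)
    return "\n\n".join(parts)
```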
## Permission architecture

**Granular tool control is NOT possible.** It is all-or-nothing:
- `--dangerously-skip-permissions` ON → ARIA can use all Claude Code tools
- `--dangerously-skip-permissions` OFF → ARIA can use no tools

OpenClaw's own permissions (`tools.allow/deny` in `openclaw.json`) have **no effect** on the
Claude Code tools - those run entirely on the proxy side.

## What did NOT work (17 attempts)

1. **settings.json in aria-core** - OpenClaw does NOT use Claude Code's settings.json
2. **tools.allow with PascalCase** (WebFetch, Grep) - OpenClaw does not know these names
3. **tools.allow with snake_case** (web_fetch) - only exec, read, write, edit are recognized
4. **tools.allow with wildcard** `["*"]` - did not help
5. **tools.allow empty + tools.profile: "full"** - only without other errors
6. **System-prompt instruction alone** - not enough when the tools are blocked
7. **exec-approvals wildcard alone** - not enough with a config validation error
8. **`openclaw config unset tools.exec.ask`** - the CLI does not know that path
9. **BOOTSTRAP.md with OpenClaw tool names** - the tools exist only at the gateway level
10. **settings.json in the proxy without the BOOTSTRAP.md fix** - BOOTSTRAP.md forbade the tools
11. **tools.byProvider.proxy.profile full** - no effect
12. **settings.json + BOOTSTRAP.md without --dangerously-skip-permissions** - `--print` ignores settings.json
13. **Manual `docker exec sed`** - overwritten on every restart
14. **`--dangerously-skip-permissions` without BUBBLEWRAP** - blocked by the root check
15. **`--allowedTools`** - the variadic argument swallows the prompt
16. **`--permission-mode bypassPermissions`** - same root check
17. **Non-root user (`su node`)** - auth path problems, credentials unreachable

## Important paths

### aria-core (OpenClaw)
- `/home/node/.openclaw/openclaw.json` - OpenClaw main config
- `/home/node/.openclaw/exec-approvals.json` - exec approvals
- `/tmp/openclaw/openclaw-YYYY-MM-DD.log` - daily log

### aria-proxy (Claude Code CLI)
- `/root/.claude/.credentials.json` - auth credentials (NOT in /root/.config/claude/)
- `/usr/local/lib/node_modules/claude-max-api-proxy/dist/` - proxy source
- `/usr/local/lib/node_modules/@anthropic-ai/claude-code/cli.js` - Claude Code CLI (contains the root check)

## OpenClaw CLI reference

```bash
openclaw config get/set/unset <path>   # manage config
openclaw approvals get                 # show exec approvals
openclaw approvals allowlist add       # allow an exec pattern
openclaw doctor [--fix]                # health check
openclaw gateway status                # gateway status
```
+29
-118
@@ -1,104 +1,23 @@
#!/bin/bash
# ════════════════════════════════════════════════
# ARIA - first-time setup after docker compose up
# Run once; everything persists afterwards.
#
# OpenClaw (aria-core) has been torn down - the setup now only
# handles the SSH key for access to the VM (aria-wohnung).
# Brain + proxy share the same key; both have aria-data/ssh
# mounted as a volume.
# ════════════════════════════════════════════════
set -e

echo "=== ARIA Setup ==="
echo ""

# Wait until aria-core is running
echo "[1/7] Waiting for aria-core..."
until docker inspect -f '{{.State.Running}}' aria-core 2>/dev/null | grep -q true; do
  sleep 2
  echo "  ... waiting..."
done
echo "  aria-core is running."

# Fix permissions - Docker volumes belong to root, OpenClaw runs as node
echo ""
echo "[2/7] Fixing permissions on /home/node/.openclaw and /home/node/.claude..."
docker exec -u root aria-core chown -R node:node /home/node/.openclaw
docker exec -u root aria-core chown -R node:node /home/node/.claude 2>/dev/null || true
docker exec -u root aria-core chmod 700 /home/node/.openclaw
echo "  Permissions OK."

# Write the OpenClaw config - custom provider for claude-max-api-proxy
echo ""
echo "[3/7] Writing openclaw.json (proxy provider + model + tools)..."
docker exec aria-core sh -c 'cat > /home/node/.openclaw/openclaw.json << '"'"'INNEREOF'"'"'
{
  "meta": {
    "lastTouchedVersion": "2026.3.8"
  },
  "gateway": {
    "mode": "local"
  },
  "agents": {
    "defaults": {
      "model": {
        "primary": "proxy/claude-sonnet-4"
      },
      "compaction": {
        "mode": "safeguard"
      },
      "timeoutSeconds": 900,
      "maxConcurrent": 4,
      "subagents": {
        "maxConcurrent": 8
      }
    }
  },
  "models": {
    "providers": {
      "proxy": {
        "api": "openai-completions",
        "baseUrl": "http://proxy:3456/v1",
        "apiKey": "not-needed",
        "models": [
          { "id": "claude-sonnet-4", "name": "claude-sonnet-4" },
          { "id": "claude-opus-4", "name": "claude-opus-4" }
        ]
      }
    }
  },
  "tools": {
    "profile": "full",
    "web": {
      "fetch": {
        "enabled": true
      }
    },
    "exec": {
      "host": "gateway"
    }
  },
  "messages": {
    "ackReactionScope": "all"
  },
  "commands": {
    "native": "auto",
    "nativeSkills": "auto",
    "restart": true,
    "ownerDisplay": "raw"
  }
}
INNEREOF'
echo "  Config written."

# Exec-approvals wildcard - allows tool execution in headless mode
echo ""
echo "[4/7] Setting the exec-approvals wildcard..."
docker exec aria-core openclaw approvals allowlist add --agent "*" "*" 2>/dev/null || true
echo "  Approvals set."

# Generate the SSH key for VM access
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SSH_DIR="$SCRIPT_DIR/aria-data/ssh"
echo ""
echo "[5/7] SSH key for VM access..."

echo "=== ARIA Setup ==="

mkdir -p "$SSH_DIR"

if [ ! -f "$SSH_DIR/id_ed25519" ]; then
  echo "Generating SSH key for aria-wohnung..."
  ssh-keygen -t ed25519 -f "$SSH_DIR/id_ed25519" -N "" -C "aria@aria-wohnung"
  cat > "$SSH_DIR/config" << 'SSHEOF'
Host aria-wohnung
@@ -108,34 +27,26 @@ Host aria-wohnung
  StrictHostKeyChecking accept-new
SSHEOF
  chmod 600 "$SSH_DIR/id_ed25519"
  chmod 644 "$SSH_DIR/id_ed25519.pub"
  chmod 644 "$SSH_DIR/config"
  echo "  Key generated."
  # Append the public key directly to root's authorized_keys (the script runs as root on the VM)
  mkdir -p /root/.ssh
  chmod 700 /root/.ssh
  cat "$SSH_DIR/id_ed25519.pub" >> /root/.ssh/authorized_keys
  chmod 600 /root/.ssh/authorized_keys
  echo "  Public key appended to /root/.ssh/authorized_keys."
  chmod 644 "$SSH_DIR/id_ed25519.pub" "$SSH_DIR/config"

  # Append the public key directly to /root/.ssh/authorized_keys
  # (the script runs as root on the VM aria-wohnung)
  if [ -w /root/.ssh ] || [ -w /root ]; then
    mkdir -p /root/.ssh
    chmod 700 /root/.ssh
    cat "$SSH_DIR/id_ed25519.pub" >> /root/.ssh/authorized_keys
    chmod 600 /root/.ssh/authorized_keys
    echo "  Public key appended to /root/.ssh/authorized_keys."
  else
    echo "  Note: could not write /root/.ssh/authorized_keys."
    echo "  Append the pubkey manually:"
    cat "$SSH_DIR/id_ed25519.pub"
  fi
else
  echo "  Key already exists."
  echo "SSH key already exists - skipped."
fi

# Fix permissions inside the container
echo ""
echo "[6/7] Fixing SSH permissions..."
docker exec -u root aria-core chown -R node:node /home/node/.ssh 2>/dev/null || true

# Restart so the gateway loads the config
echo ""
echo "[7/7] Restarting aria-core..."
docker restart aria-core

echo ""
echo "=== Setup done ==="
echo ""
echo "Test with: docker logs aria-core --tail 20"
echo "Expected line: 'agent model: proxy/claude-sonnet-4'"
echo ""
echo "SSH test: docker exec aria-core ssh aria-wohnung hostname"
echo "Tool test: create a new session, then ask 'Wie wird das Wetter in Bremen?'"
echo "Next step: docker compose up -d"
echo "Test: docker exec aria-brain ssh aria-wohnung hostname"
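The wait loop in step [1/7] polls `docker inspect` until the container reports running; the same idea in Python, assuming the docker CLI is available (illustrative, not part of the repo):

```python
import subprocess
import time

def wait_for_container(name: str, poll: float = 2.0) -> None:
    """Block until `docker inspect` reports the container as running."""
    while True:
        result = subprocess.run(
            ["docker", "inspect", "-f", "{{.State.Running}}", name],
            capture_output=True, text=True,
        )
        if result.stdout.strip() == "true":
            return
        time.sleep(poll)

wait_for_container("aria-core")
```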
+1
-1
@@ -1,6 +1,6 @@
# ════════════════════════════════════════════════
# ARIA Voice Bridge - Dockerfile
# Whisper STT + Piper TTS + wake word
# Whisper STT + wake word (TTS via XTTS v2, remote)
# ════════════════════════════════════════════════

FROM python:3.12-slim

+1066
-474
File diff suppressed because it is too large
@@ -91,6 +91,39 @@ _ACTIVATION_MAP: dict[str, Mode] = {
    mode.config.activation_phrase.lower(): mode for mode in Mode
}

# ID mapping for API mode switches (e.g. the app's ModeSelector sends 'normal')
_ID_MAP: dict[str, Mode] = {
    "normal": Mode.NORMAL,
    "nicht_stoeren": Mode.DND,
    "dnd": Mode.DND,
    "fluester": Mode.WHISPER,
    "whisper": Mode.WHISPER,
    "hangar": Mode.HANGAR,
    "gaming": Mode.GAMING,
}


def mode_from_id(mode_id: str) -> Optional[Mode]:
    """ID-based mapping for API mode switches (no activation phrase needed)."""
    if not mode_id:
        return None
    return _ID_MAP.get(mode_id.strip().lower())


# Canonical IDs for broadcasts (match the app UI IDs in ModeSelector)
_CANONICAL_ID: dict[Mode, str] = {
    Mode.NORMAL: "normal",
    Mode.DND: "nicht_stoeren",
    Mode.WHISPER: "fluester",
    Mode.HANGAR: "hangar",
    Mode.GAMING: "gaming",
}


def canonical_id(mode: Mode) -> str:
    """Canonical ID known to the app, the Diagnostic and the bridge alike."""
    return _CANONICAL_ID.get(mode, mode.name.lower())


def detect_mode_switch(text: str) -> Optional[Mode]:
    """Detects whether a text contains a mode switch.

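A quick illustrative round-trip through the mapping helpers above (assuming the `Mode` enum from the same module; this snippet is not part of the diff):

```python
# Illustrative round-trip through the mapping helpers above,
# assuming the Mode enum from the same module:
assert mode_from_id("dnd") is Mode.DND              # alias resolves to the same mode
assert mode_from_id("  Fluester ") is Mode.WHISPER  # trimmed + case-insensitive
assert canonical_id(Mode.DND) == "nicht_stoeren"    # broadcast ID matches the app UI
```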
@@ -5,8 +5,7 @@
# STT - Whisper (local, no API needed)
faster-whisper

# TTS - Piper (offline, German voices)
piper-tts
# TTS: runs remotely via XTTS v2 on the gaming PC (no local deps needed)

# WebSocket connection to aria-core
websockets
@@ -17,3 +16,6 @@ sounddevice

# Wake-word detection
openwakeword

# Image resizing (shrink oversized pixel images before Claude Vision sees them - 5 MB limit)
Pillow

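The Pillow line above backs the server-side resize mentioned in the issue list: uploads above roughly 2 MB are scaled down to at most 1568 px on the long edge before being base64-encoded for Claude Vision. A minimal sketch of that step (the threshold, edge length, and function name are illustrative, not the bridge's actual code):

```python
import io

from PIL import Image

MAX_EDGE = 1568                    # long-edge target mentioned in the issue list
SIZE_THRESHOLD = 2 * 1024 * 1024   # only touch files above ~2 MB

def shrink_for_vision(data: bytes) -> bytes:
    """Downscale an oversized JPEG/PNG/WebP before base64-encoding it."""
    if len(data) <= SIZE_THRESHOLD:
        return data
    img = Image.open(io.BytesIO(data))
    img.thumbnail((MAX_EDGE, MAX_EDGE))  # in-place, keeps the aspect ratio
    out = io.BytesIO()
    img.save(out, format=img.format or "JPEG")
    return out.getvalue()
```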
@@ -0,0 +1,16 @@
@echo off
REM ================================================================
REM ARIA - cleanup wrapper for Windows
REM ================================================================
REM Calls cleanup-windows.ps1 with ExecutionPolicy Bypass.
REM Works even when Windows refuses to start the .ps1 directly.
REM
REM Usage:
REM   cleanup-windows.bat stefan
REM   cleanup-windows.bat stefan -SkipPrune
REM
REM Double-clicking does NOT work (the username is a required param).
REM Invoke it from a console.
REM ================================================================

powershell.exe -NoProfile -ExecutionPolicy Bypass -File "%~dp0cleanup-windows.ps1" %*
@@ -0,0 +1,184 @@
# ================================================================
# ARIA - Windows / WSL2 / Docker Desktop VHDX cleanup
# ================================================================
#
# Finds all WSL2 + Docker Desktop ext4.vhdx files under
# C:\Users\<USER>\AppData\Local\... and compacts them via diskpart.
# This reclaims disk space you freed INSIDE the distros/containers
# (e.g. after `docker system prune`) that the VHDX has not yet
# released.
#
# Usage (PowerShell as ADMIN, or via cleanup-windows.bat):
#   .\cleanup-windows.ps1 stefan
#   .\cleanup-windows.ps1 -User stefan
#   .\cleanup-windows.ps1 -User stefan -SkipPrune   # compact only
#   .\cleanup-windows.ps1 -User stefan -PruneOnly   # prune only
#
# What happens:
#  1. First (optional): docker system prune + builder prune in WSL2
#  2. wsl --shutdown
#  3. Shrink all found .vhdx files with diskpart compact vdisk
#
# Note: diskpart needs NO Hyper-V tools (unlike Optimize-VHD).
#
# ASCII-only so Windows PowerShell 5.1 parses the file correctly
# without a BOM (UTF-8 special characters would otherwise be
# misread as Windows-1252).
# ================================================================

[CmdletBinding()]
param(
    [Parameter(Mandatory=$true, Position=0,
               HelpMessage="Your Windows username (e.g. stefan)")]
    [string]$User,

    [Parameter(HelpMessage="Skip the Docker prune - compact only")]
    [switch]$SkipPrune,

    [Parameter(HelpMessage="ONLY run the Docker prune, then exit")]
    [switch]$PruneOnly
)

# Defensive: set the process-scope ExecutionPolicy to Bypass - prevents
# child invocations (e.g. modules) from being blocked. Harmless if the
# parent already invoked Bypass.
try { Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass -Force | Out-Null } catch {}

# Admin check + self-elevation
# If not started as admin -> restart once as admin, with ExecutionPolicy
# Bypass + the original arguments. The user only has to confirm the UAC
# prompt once.
$isAdmin = ([Security.Principal.WindowsPrincipal] `
    [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole(
    [Security.Principal.WindowsBuiltInRole]::Administrator)
if (-not $isAdmin) {
    Write-Host "-> Restarting as administrator (with ExecutionPolicy Bypass)..." -ForegroundColor Yellow
    $myPath = $MyInvocation.MyCommand.Path
    $forwardArgs = @("-NoProfile", "-ExecutionPolicy", "Bypass", "-File", "`"$myPath`"")
    if ($User) { $forwardArgs += @("-User", $User) }
    if ($SkipPrune) { $forwardArgs += "-SkipPrune" }
    if ($PruneOnly) { $forwardArgs += "-PruneOnly" }
    try {
        Start-Process powershell.exe -Verb RunAs -ArgumentList $forwardArgs
    } catch {
        Write-Host "[FAIL] UAC elevation cancelled or failed." -ForegroundColor Red
        Write-Host "       Right-click PowerShell -> 'Run as administrator'" -ForegroundColor Yellow
        exit 1
    }
    exit 0
}

$basePath = "C:\Users\$User\AppData\Local"
if (-not (Test-Path $basePath)) {
    Write-Host "[FAIL] Path does not exist: $basePath" -ForegroundColor Red
    Write-Host "       Check the username." -ForegroundColor Yellow
    exit 1
}

Write-Host "================================================================" -ForegroundColor Cyan
Write-Host " ARIA cleanup for user: $User" -ForegroundColor Cyan
Write-Host "================================================================" -ForegroundColor Cyan
Write-Host ""

# -- 1. Docker prune (in WSL2) -----------------------------------
if (-not $SkipPrune) {
    Write-Host "[1/3] Docker cleanup in WSL2..." -ForegroundColor Yellow
    Write-Host "      docker system prune -a --volumes -f" -ForegroundColor Gray
    Write-Host "      docker builder prune -a -f" -ForegroundColor Gray
    Write-Host ""
    try {
        wsl -e bash -c "docker system prune -a --volumes -f && docker builder prune -a -f"
        Write-Host "      [OK] done" -ForegroundColor Green
    } catch {
        Write-Host "      [WARN] Docker prune failed (maybe Docker Desktop is not running?)" -ForegroundColor Yellow
        Write-Host "      $_" -ForegroundColor Gray
    }
    Write-Host ""
    if ($PruneOnly) {
        Write-Host "PruneOnly set - done." -ForegroundColor Cyan
        exit 0
    }
}

# -- 2. WSL2 shutdown --------------------------------------------
Write-Host "[2/3] Shutting down WSL2..." -ForegroundColor Yellow
wsl --shutdown
Start-Sleep -Seconds 3
Write-Host "      [OK] done" -ForegroundColor Green
Write-Host ""

# -- 3. Find + compact VHDX files --------------------------------
Write-Host "[3/3] Finding + compacting VHDX files..." -ForegroundColor Yellow
Write-Host ""

$vhdxFiles = @()
$vhdxFiles += Get-ChildItem -Path "$basePath\Docker" -Recurse -Filter "*.vhdx" -ErrorAction SilentlyContinue
$vhdxFiles += Get-ChildItem -Path "$basePath\Packages" -Recurse -Filter "ext4.vhdx" -ErrorAction SilentlyContinue
$vhdxFiles = $vhdxFiles | Sort-Object FullName -Unique

if ($vhdxFiles.Count -eq 0) {
    Write-Host "      No .vhdx files found." -ForegroundColor Yellow
    exit 0
}

Write-Host "Files found (before):" -ForegroundColor Cyan
foreach ($f in $vhdxFiles) {
    $sizeGB = [math]::Round($f.Length / 1GB, 2)
    Write-Host ("  {0,8} GB  {1}" -f $sizeGB, $f.FullName) -ForegroundColor Gray
}
Write-Host ""

$totalBefore = ($vhdxFiles | Measure-Object Length -Sum).Sum

foreach ($f in $vhdxFiles) {
    Write-Host "-> Compact: $($f.FullName)" -ForegroundColor White
    $sizeBefore = [math]::Round($f.Length / 1GB, 2)

    # Write a temporary diskpart script
    $tmp = [System.IO.Path]::GetTempFileName()
    @"
select vdisk file="$($f.FullName)"
attach vdisk readonly
compact vdisk
detach vdisk
exit
"@ | Out-File -Encoding ASCII -FilePath $tmp

    try {
        $output = & diskpart /s $tmp 2>&1
        # Re-read the file - Length is cached
        $newFile = Get-Item $f.FullName
        $sizeAfter = [math]::Round($newFile.Length / 1GB, 2)
        $saved = [math]::Round($sizeBefore - $sizeAfter, 2)
        if ($saved -gt 0) {
            Write-Host ("   [OK] {0} GB -> {1} GB (saved: {2} GB)" -f $sizeBefore, $sizeAfter, $saved) -ForegroundColor Green
        } else {
            Write-Host ("   --  {0} GB -> {1} GB (nothing to reclaim - file was already optimal)" -f $sizeBefore, $sizeAfter) -ForegroundColor DarkGray
        }
    } catch {
        Write-Host "   [FAIL] Error: $_" -ForegroundColor Red
        Write-Host "   diskpart output:" -ForegroundColor DarkGray
        $output | ForEach-Object { Write-Host "   $_" -ForegroundColor DarkGray }
    } finally {
        Remove-Item $tmp -ErrorAction SilentlyContinue
    }
    Write-Host ""
}

# -- Summary ------------------------------------------------------
$vhdxFilesAfter = @()
$vhdxFilesAfter += Get-ChildItem -Path "$basePath\Docker" -Recurse -Filter "*.vhdx" -ErrorAction SilentlyContinue
$vhdxFilesAfter += Get-ChildItem -Path "$basePath\Packages" -Recurse -Filter "ext4.vhdx" -ErrorAction SilentlyContinue
$vhdxFilesAfter = $vhdxFilesAfter | Sort-Object FullName -Unique
$totalAfter = ($vhdxFilesAfter | Measure-Object Length -Sum).Sum

$savedTotal = [math]::Round(($totalBefore - $totalAfter) / 1GB, 2)

Write-Host "================================================================" -ForegroundColor Cyan
Write-Host (" Total: {0} GB -> {1} GB (saved: {2} GB)" -f `
    [math]::Round($totalBefore / 1GB, 2),
    [math]::Round($totalAfter / 1GB, 2),
    $savedTotal) -ForegroundColor Cyan
Write-Host "================================================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Done. Docker Desktop / WSL2 will start again on their own the next time they are invoked." -ForegroundColor Green
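For reference, the same compact step in Python, assuming `diskpart` is on PATH and the VHDX path is already known (an illustrative sketch, not part of the repo):

```python
import os
import subprocess
import tempfile
from pathlib import Path

def compact_vhdx(vhdx: Path) -> None:
    """Shrink a VHDX via diskpart, mirroring step 3 of the script above."""
    script = (
        f'select vdisk file="{vhdx}"\n'
        "attach vdisk readonly\n"
        "compact vdisk\n"
        "detach vdisk\n"
        "exit\n"
    )
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write(script)
        path = tmp.name
    try:
        subprocess.run(["diskpart", "/s", path], check=True)
    finally:
        os.unlink(path)
```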
+2016
-576
File diff suppressed because it is too large
+807
-758
File diff suppressed because it is too large
+45
-39
@@ -9,7 +9,7 @@ services:
    command: >-
      sh -c "apk add --no-cache openssh-client bash curl &&
      npm install -g @anthropic-ai/claude-code claude-max-api-proxy &&
      DIST=$(find /usr/local/lib -path '*/claude-max-api-proxy/dist' -type d | head -1) &&
      DIST=$$(find /usr/local/lib -path '*/claude-max-api-proxy/dist' -type d | head -1) &&
      sed -i 's/startServer({ port })/startServer({ port, host: process.env.HOST || \"127.0.0.1\" })/' $$DIST/server/standalone.js &&
      sed -i 's/if (model\.includes/if ((model||\"claude-sonnet-4\").includes/g' $$DIST/adapter/cli-to-openai.js &&
      sed -i '1i\\function _t(c){return typeof c===\"string\"?c:Array.isArray(c)?c.filter(function(b){return b.type===\"text\"}).map(function(b){return b.text||\"\"}).join(\"\"):String(c)}' $$DIST/adapter/openai-to-cli.js &&
@@ -28,38 +28,40 @@ services:
    networks:
      - aria-net

  # ─── OpenClaw (ARIA's brain) ────────────────────────────
  aria:
    image: ghcr.io/openclaw/openclaw:latest
    container_name: aria-core
    hostname: aria-wohnung
    privileged: true   # ARIA's apartment - she holds the keys
  # ─── Qdrant (vector DB for ARIA's memory) ─────────────
  # Storage lives in the repo bind mount aria-data/brain/qdrant.
  # That way backup/export/import go entirely through the filesystem.
  qdrant:
    image: qdrant/qdrant:latest
    container_name: aria-qdrant
    volumes:
      - ./aria-data/brain/qdrant:/qdrant/storage
    restart: unless-stopped
    networks:
      - aria-net

  # ─── ARIA Brain (agent + memory) ─────────────────────────
  # Replaces the old aria-core (OpenClaw). Vector-DB-based
  # memory, its own agent loop, SSH to the aria-wohnung VM.
  brain:
    build: ./aria-brain
    container_name: aria-brain
    hostname: aria-wohnung-brain   # keeps ssh known_hosts stable
    extra_hosts:
      - "host.docker.internal:host-gateway"   # access to the VM via SSH
    depends_on:
      - qdrant
      - proxy
    ports:
      - "3001:3001"   # Diagnostic web UI (runs in the shared network)
    environment:
      - CANVAS_HOST=127.0.0.1
      - OPENCLAW_GATEWAY_TOKEN=${ARIA_AUTH_TOKEN}
      - DEFAULT_MODEL=proxy/claude-sonnet-4
      - RATE_LIMIT_PER_USER=30
      - DISPLAY=:0
      - QDRANT_HOST=aria-qdrant
      - QDRANT_PORT=6333
      - PROXY_URL=http://proxy:3456
      - ARIA_AUTH_TOKEN=${ARIA_AUTH_TOKEN:-}
    volumes:
      - openclaw-config:/home/node/.openclaw   # OpenClaw config (persists model + auth)
      - ./aria-data/brain:/home/node/.openclaw/workspace/memory
      - ./aria-data/skills:/home/node/.openclaw/workspace/skills
      - ./aria-data/config/AGENT.md:/home/node/.openclaw/workspace/AGENT.md
      - ./aria-data/config/USER.md:/home/node/.openclaw/workspace/USER.md
      - ./aria-data/config/BOOTSTRAP.md:/home/node/.openclaw/workspace/BOOTSTRAP.md
      - ./aria-data/config/BOOTSTRAP.md:/home/node/.openclaw/workspace/CLAUDE.md
      - ./aria-data/config/openclaw.env:/home/node/.openclaw/workspace/.env
      - claude-config:/home/node/.claude   # Claude Code settings (permissions)
      - ./aria-data/ssh:/home/node/.ssh    # SSH keys for VM access
      - /tmp/.X11-unix:/tmp/.X11-unix
      - /var/run/docker.sock:/var/run/docker.sock   # manage the VM from inside
      - aria-shared:/shared   # shared volume for file exchange (bridge <> core)
      - ./aria-data/brain/data:/data   # memory cache + skills + models (bind mount for export)
      - ./aria-data/brain-import:/import:ro   # source MDs for the initial memory import (read-only)
      - ./aria-data/ssh:/root/.ssh   # SSH keys for aria-wohnung (shared with the proxy)
      - aria-shared:/shared   # same exchange storage as the bridge
    restart: unless-stopped
    networks:
      - aria-net
@@ -69,12 +71,13 @@ services:
    build: ./bridge
    container_name: aria-bridge
    depends_on:
      - aria
    network_mode: "service:aria"   # shares its network with aria-core → localhost:18789
      - brain
    networks:
      - aria-net
    ports:
      - "3001:3001"   # Diagnostic web UI (Diagnostic shares its network with the bridge)
    volumes:
      - ./aria-data/voices:/voices:ro   # TTS voices
      - ./aria-data/config/aria.env:/config/aria.env
      - aria-shared:/shared   # shared volume for file exchange (bridge <> core)
      - aria-shared:/shared   # shared volume for file exchange
      # Audio access
      - /run/user/1000/pulse:/run/user/1000/pulse
      - /dev/snd:/dev/snd
@@ -82,6 +85,7 @@ services:
      - /dev/snd
    environment:
      - PULSE_SERVER=unix:/run/user/1000/pulse/native
      - BRAIN_URL=http://aria-brain:8080
      - ARIA_AUTH_TOKEN=${ARIA_AUTH_TOKEN:-}
      - RVS_HOST=${RVS_HOST:-}
      - RVS_PORT=${RVS_PORT:-443}
@@ -91,19 +95,23 @@ services:
    restart: unless-stopped

  # ─── Diagnostic (self-check UI and settings) ─────────
  # Shares its network with the bridge so the Diagnostic server
  # can reach the bridge on localhost.
  diagnostic:
    build: ./diagnostic
    container_name: aria-diagnostic
    depends_on:
      - aria
    network_mode: "service:aria"   # shares its network with aria-core → localhost:18789
      - bridge
    network_mode: "service:bridge"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/run/docker.sock:/var/run/docker.sock   # container restart + brain export/import
      - ./aria-data/config/diag-state:/data   # persistent state (active session etc.)
      - aria-shared:/shared   # shared volume (uploads + config)
      - aria-shared:/shared   # shared volume (uploads + config + voices)
      - ./aria-data/brain:/brain   # brain export/import (tar.gz from the bind mount)
    environment:
      - ARIA_AUTH_TOKEN=${ARIA_AUTH_TOKEN:-}
      - PROXY_URL=http://proxy:3456
      - BRAIN_URL=http://aria-brain:8080
      - RVS_HOST=${RVS_HOST:-}
      - RVS_PORT=${RVS_PORT:-443}
      - RVS_TLS=${RVS_TLS:-true}
@@ -112,9 +120,7 @@ services:
    restart: unless-stopped

volumes:
  openclaw-config:   # persists ~/.openclaw (model, auth, sessions)
  claude-config:     # persists ~/.claude (permissions, settings)
  aria-shared:       # file exchange between bridge and core
  aria-shared:       # file exchange between bridge / brain / Diagnostic

networks:
  aria-net:

@@ -1,32 +0,0 @@
#!/bin/bash
# ════════════════════════════════════════════════
# ARIA - download Piper voices
# Ramona (everyday) + Thorsten (epic moments)
# ════════════════════════════════════════════════

set -e

VOICES_DIR="aria-data/voices"
BASE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/de/de_DE"

mkdir -p "$VOICES_DIR"
cd "$VOICES_DIR"

echo "Downloading ARIA voices..."
echo ""

echo "[1/4] Ramona (model)..."
wget -q --show-progress "$BASE_URL/ramona/low/de_DE-ramona-low.onnx"

echo "[2/4] Ramona (config)..."
wget -q --show-progress "$BASE_URL/ramona/low/de_DE-ramona-low.onnx.json"

echo "[3/4] Thorsten (model)..."
wget -q --show-progress "$BASE_URL/thorsten/high/de_DE-thorsten-high.onnx"

echo "[4/4] Thorsten (config)..."
wget -q --show-progress "$BASE_URL/thorsten/high/de_DE-thorsten-high.onnx.json"

echo ""
echo "Voices downloaded!"
ls -lh *.onnx
@@ -0,0 +1,31 @@
#!/bin/bash
# ════════════════════════════════════════════════════════════
# ARIA - setup script
#
# Currently only the .env bootstrap (tokens + RVS). All other
# settings end up in /shared/config/runtime.json via the
# Diagnostic (persistent in the "database").
#
# In the phase-A cleanup state: system-prompt files live under
# aria-data/brain-import/ and will be imported later by the new
# agent framework. OpenClaw still runs without a personality.
# ════════════════════════════════════════════════════════════

set -e
cd "$(dirname "$0")"

if [ ! -f .env ]; then
  if [ -f .env.example ]; then
    cp .env.example .env
    echo "✓ .env created from .env.example - enter your tokens now!"
  else
    echo "⚠ No .env.example found - create one manually."
  fi
else
  echo ".env already exists - skipped."
fi

# ── Create the brain directories (bind mounts for aria-brain + aria-qdrant)
# Contents are gitignored - backed up via the Diagnostic export/import.
mkdir -p aria-data/brain/data aria-data/brain/qdrant
echo "✓ aria-data/brain/{data,qdrant} ready"
@@ -1,60 +1,240 @@
# ARIA Issues & Features

## Audio behavior in the app

This is how the app should treat third-party audio apps (Spotify, YouTube,
podcasts etc.) and its own microphone in the various phases.
If anything behaves differently, it's a bug.

| Phase | Other app (Spotify) | ARIA mic | Background service |
|-------------------------------|----------------------|------------------------|---------------------|
| Idle / ear off | plays freely | off | off |
| Wake word listening (armed) | plays freely | passive (openWakeWord) | active ('wake') |
| User recording running | paused (EXCLUSIVE) | recording | active ('rec') |
| Recording finished | resumed | off | (rec released) |
| ARIA thinks/types (~20 s) | plays freely | off | (no slot) |
| TTS starts | paused (DUCK) | off (or barge) | active ('tts') |
| TTS playing (incl. GPU pauses)| stays paused | barge on wake word | active |
| TTS finished | resumed after 800 ms | (conversation window) | (tts released) |
| Incoming call (incl. VoIP) | — | mic paused | off |
| Call over | — | mic armed again | active ('wake') |
| Call over (auto-resume) | pauses again | off | active ('tts') |
| New question during call | — | mic paused | (no rec during call) |
| Call over after new question | (see TTS phases) | (see TTS phases) | (tts wins, old resume discarded) |

Key mechanisms:
- **Underrun protection** in the PcmStreamPlayer feeds in silence when the
  bridge delivers nothing during render pauses - Spotify stays paused
  throughout, even between the sentences of a long answer.
- **Conversation focus** (only in the wake-word state 'conversing') holds the
  AudioFocus permanently. It does not apply to plain tap-to-talk or text
  chat - Spotify may keep playing during the thinking phase.
- **Foreground service** (mediaPlayback|microphone) keeps the app process
  alive so TTS/mic/wake word keep running while the app is minimized. The
  notification shows the current status ("ARIA speaking/listening/ready").
- **Call detection** via TelephonyManager (classic) + an AudioFocus-loss
  listener with a polling fallback (VoIP such as WhatsApp/Signal/Discord).
- **Auto-resume after a call**: on halt, the playback position is recorded
  (Date.now() - playbackStart - leadingSilence). After hang-up the app waits
  up to 30 s for the WAV cache and then continues from the recorded position.
  If the call lasted longer than the answer, the cache is already complete -
  instant resume. (See the sketch after this list.)
- **New question during a call** (text chat works despite the call): the new
  answer overrides the pending resume. _handlePcmChunkImpl stops a possibly
  running resumeSound and resets pausedMessageId when the new stream
  messageId differs. The latest answer always wins.
- **Audio output despite an active call**: ARIA also answers through the
  speaker during a phone call (the call audio goes to the other side over a
  separate stream). haltAllPlayback is only called on the STATE CHANGE to
  ringing/offhook - if the call is already running (offhook→offhook), a new
  question no longer triggers a halt.

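A minimal sketch of the resume-position bookkeeping from the auto-resume item above (a Python stand-in for the app-side logic; class and method names are illustrative):

```python
import time

class ResumeTracker:
    """Track where playback was interrupted so it can continue after a call."""

    def __init__(self, leading_silence_ms: int = 200) -> None:
        self.leading_silence_ms = leading_silence_ms
        self.playback_start_ms: float | None = None
        self.paused_position_ms: float | None = None

    def on_playback_start(self) -> None:
        self.playback_start_ms = time.time() * 1000

    def on_call_interrupt(self) -> None:
        """Record how far into the audio we were when the call came in."""
        if self.playback_start_ms is not None:
            elapsed = time.time() * 1000 - self.playback_start_ms
            self.paused_position_ms = max(0.0, elapsed - self.leading_silence_ms)

    def take_resume_position(self) -> float | None:
        """Position to seek to in the cached WAV; None if nothing was paused."""
        pos, self.paused_position_ms = self.paused_position_ms, None
        return pos
```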
## Done

### Bugs / fixes

- [x] Diagnostic: "ARIA is thinking..." no longer gets stuck
- [x] App: "ARIA is thinking..." indicator + cancel button (bridge mirrors agent_activity via RVS)
- [x] Text messages are answered by ARIA (bridge chat handler fix)
- [x] Voice selection works again: speaker_wav as basename instead of path for the daswer123 local mode
- [x] Diagnostic voice switch resets all app-local voice overrides via type "config"
- [x] Streaming TTS stop race: the writer waits for playbackHeadPosition before stop()/release() - no more clipped sentences
- [x] App: audio output no longer stops mid-sentence (playbackHeadPosition wait + stop-race fix)
- [x] AudioFocus.release waits for the real end of playback - no more volume ramp-up mid-answer
- [x] App mute/auto-playback bug: closure bug solved (ttsCanPlayRef mirrored live, no longer stale)
- [x] App zombie recording: ear-off kills a running recording so the record button keeps working
- [x] Whisper no longer transcribes voice uploads with a hardcoded "small" - the current model is kept, no needless model swap
- [x] RVS/WebSocket maxPayload 50 MB: voice_upload with a WAV as base64 no longer blows the frame limit
- [x] Wake-word embedding rank-4 fix (pipeline bug that prevented triggering) + read the frame count from the model metadata
- [x] PCM underrun protection: silence fill during render pauses prevents Spotify's auto-resume after 10 s of standstill
- [x] Conversation-focus lifecycle: AudioFocus is tied to the wake-word state 'conversing' instead of individual streams - Spotify stays paused throughout, even between multiple answers
- [x] Voice override keeps the voice across all TTS calls of one answer (before: back to default after the first TTS call)
- [x] Voice-message bubble defensive: the STT result adds a new bubble when the placeholder is missing (race protection)
- [x] Image + text as ONE request: the bridge buffers files for 800 ms and merges them with the following chat text into a single send_to_core (instead of two separate ARIA answers)
- [x] Diagnostic→app: persistent RVS connection instead of a fresh one per send (race problems with zombie WS solved)
- [x] Text selection in bubbles works again (nested Text+onPress removed; dataDetectorType="all" makes links clickable automatically)
- [x] **Placeholder race with parallel voice messages solved**: every recording gets a unique audioRequestId, the bridge returns it with the STT result - the app now matches exactly the right bubble instead of matching by substring
- [x] The "🎤 speak now" mic-open toast only appears once audioService.startRecording really succeeded (instead of ~400 ms earlier at wake-word detect)
- [x] Voice messages without an STT result are removed automatically after 60 s + recording duration (safe enough for 5-30 min recordings, fast enough for empty wake-word echoes)
- [x] VAD adaptive baseline more robust: minimum instead of avg + cap to -50 dB..-28 dB (silence) / -40 dB..-18 dB (speech) - no more "dead" VAD configuration in loud environments or on wake-word echo
- [x] Push-to-talk removed, only tap-to-talk left (prevented touch race problems)
- [x] Manual mic stop ends a wake-word conversation: tapping the mic button during conversing → audio out + back to armed (= the wake word listens again, no auto-mic after ARIA's answer). VAD auto-stop stays for multi-turn
- [x] **Wake word pauses during calls**: phoneCall calls pauseForCall (openWakeWord.stop) on RINGING/OFFHOOK, resumeFromCall on IDLE. The pre-call state is remembered - armed stays armed, conversing degrades to armed (the user should not land in half a dialog)
- [x] **App-resume cooldown**: switching background → foreground no longer triggers a false wake word. The AppState listener sets a 1.5 s cooldown in which onWakeDetected events are ignored (the audio-level spike at the AudioFocus switch was otherwise interpreted as a wake word)
- [x] Background mic robust: acquireBackgroundAudio('rec'/'wake') is now called BEFORE AudioRecord.startRecording - the foreground service with foregroundServiceType=microphone must be active before the mic engages, otherwise Android 11+ blocks background access
- [x] **Silence level manually settable** (Settings → Voice input): override value in dB from -55 to -15, default "automatic". Info button with a modal explaining the scale (lower = more sensitive, higher = more robust against background noise). With a manual value set, the adaptive baseline is ignored
- [x] **Short TTS texts (1-3 words) now play** - on a OnePlus A12 the AudioTrack stalled at `pos=0` because the default start threshold `bufferSize/2` (= 2 s) was never exceeded for short streams. Fix: `setStartThresholdInFrames(100ms)` right after the track build (API 31+). Buffer at 4 s decoupled from pre-roll, `play()` is called on the very first data chunk
- [x] **Mute button now also stops a running PCM stream** - `pcmStreamActive` was already set to false on the isFinal chunk, but the AudioTrack kept playing from its buffer for seconds. `stopPlayback()` therefore skipped `PcmStreamPlayer.stop()`. Fix: always call stop() (it is idempotent), no flag check anymore
- [x] **GPS permission in the manifest + runtime request** on the settings toggle - before, ACCESS_COARSE_LOCATION / ACCESS_FINE_LOCATION were missing entirely. `Geolocation.getCurrentPosition` failed silently, the app never sent a location field
- [x] **GPS position also in the STT payload to Diagnostic** - the app sends location once in the audio payload. The bridge used it (it went into aria-core's context) but did not pass it on in the STT broadcast to Diagnostic. Diagnostic therefore never showed the GPS block for voice input, even with the "show GPS" toggle active
- [x] **Auto-resume after a call - pcmBuffer survives**: `haltAllPlayback` emptied the pcmBuffer mid-call, isFinal then wrote an empty WAV. New `pauseForCall` method instead of `haltAllPlayback`: the AudioTrack stops + focus is released, `pcmBuffer` and `pcmMessageId` stay - chunks keep being collected so isFinal writes the WAV and resumeFromInterruption finds it. Plus `captureInterruption` made idempotent (ringing → offhook does not overwrite)
- [x] **Replay resume after a call**: `_firePlaybackStarted` overwrote `currentPlaybackMsgId` with an empty pcmMessageId - captureInterruption had nothing to remember. Plus the regex `[0-9a-f-]+\.wav` did not match all file names. Plus `_playFromPathAtPosition` now updates the tracking so a second call within the same answer also works
- [x] **`pauseForCall` resets `isPlaying`**: before, further play-button clicks hung after a call because `playAudio` skips the `_playNext` path when `isPlaying=true`
- [x] **Play button re-renders when the cache file is gone**: before, the button only checked `if (item.audioPath)` - which silently pointed at a deleted cache file. Now an RNFS.exists check with fallback to a `tts_request` to the bridge → F5-TTS renders anew and the WAV goes back into the cache
- [x] **Bridge WebSocket max_size 50 MB**: Python `websockets.connect` has a 1 MiB default - Stefan's 4 MB JPEG (5.78 MB base64) blew past it, the bridge connection was dropped silently. The f5tts/whisper bridges already had max_size, only aria_bridge had been forgotten
- [x] **Bridge resizes images >2 MB server-side to 1568 px**: the Claude Vision API has a ~5 MB base64 limit. Gallery images via `react-native-image-picker` are already small client-side, but the paperclip/DocumentPicker passed the raw file through - Claude returned an empty answer. Pillow in the bridge container, only for JPEG/PNG/WebP/GIF (PDFs/ZIPs/SVGs untouched)
- [x] **Bridge `chat:error` also reads `errorMessage`**: on state=error OpenClaw puts the text there instead of in `error` → the bridge reported a generic "[Error] Unknown", the real error only showed in container logs. Plus: a `chat:final` without text is now reported to the app with a hint bubble (instead of silently), e.g. when Vision silently rejects the image
- [x] **Cache cleanup at app start** - orphaned `aria_tts_*.wav` files (>5 min) in CachesDirectoryPath are swept away; they otherwise pile up when a sound is stopped mid-playback (call, mute, barge-in) and the completion callback never fires. Plus a new settings button "Clear TTS cache" with a live size display
- [x] **Verbose-logging toggle in Settings → Log**: `console.log` can be muted globally (warn/error stay active) - saves adb-logcat space when everything runs fine
- [x] **800 ms delay before the post-call auto-resume**: ARIA's new focus request otherwise collided with Spotify's auto-resume after call end. The system is still in the IN_CALL→NORMAL mode transition, Spotify sees loss → loss and stays paused. With the delay, Spotify completes its resume step and ARIA then pauses it again properly
- [x] **Mute button = stop for the current answer**: before, a NEW PCM chunk sequence after unmute continued the old answer where it left off (worked twice, then no more because isFinal had already arrived). Now with `_stoppedMessageId` tracking: on mute the active msgId is remembered and all further chunks of that msgId stay silent - even if mute is lifted. Reset on a new msgId; new answers play normally
- [x] **Spotify resumes after a mute stop**: `stopPlayback` cleanly releases its TRANSIENT focus (USAGE_ASSISTANT) → Spotify gets a GAIN event and resumes automatically. An interim `kickReleaseMedia` (USAGE_MEDIA + GAIN) actually prevented the auto-resume (Spotify read it as a user-action stop) - removed again
- [x] **ARIA can return files to the user** (PDFs, images, office docs, markdown, ZIPs, ...): ARIA puts `[FILE: /shared/uploads/aria_<name>.<ext>]` markers at the end of an answer, the bridge parses them out (TTS does not read them aloud) and sends a `file_from_aria` event over RVS. The app shows an attachment bubble + a click opens via the Android intent picker (`FileOpenerModule`, FileProvider); Diagnostic shows a bubble + PDFs/images in a new tab, everything else as a download. Multiple markers = multiple bubbles; non-existent markers are reported to the user with a hint (instead of being silently dropped)
- [x] **External images/files are persisted server-side**: ARIA downloads external URLs (Wikipedia, Wikimedia Commons) with curl and returns them via `[FILE: ...]` markers - they stay in the chat permanently even if the online source dies. The system prompt instructs her to use the pattern
- [x] **ARIA file bubbles survive a browser refresh**: on `load_chat_history` the Diagnostic server parses the markers from the OpenClaw session file and sends along `aria_file` entries so the attachment bubbles are restored after F5. Plus: `/shared/uploads/` image paths are also rendered as inline images in the history render (before: only in live bubbles)
- [x] **"Repair ARIA" button** in app + Diagnostic: triggers `openclaw doctor --fix` via RVS → bridge → Diagnostic HTTP API. A fix for stuck runs without SSH
- [x] **"Hard-restart ARIA" button**: docker compose restart via the Docker socket API in the Diagnostic server. With confirmation in the app, for cases where doctor is not enough (alive but with a hanging run)
- [x] **Auto-compact after N messages**: with a session too long, Linux throws E2BIG on subprocess spawn (argument list too long, ~128 KB-2 MB limit). The bridge counts user messages; at `COMPACT_AFTER_MESSAGES` (env, default 140) sessions are cleared + the container restarted, and the user gets a hint bubble. Plus a manual "🧹 Compact conversation" button in the app settings and Diagnostic (see the sketch after this list)
- [x] **`[FILE: ...]` marker filter everywhere in Diagnostic**: the filter sits directly in `addChat` so it applies to all code paths (chat_final, proxy_result, history load, ...) - before, markers slipped through as text when they did not arrive via chat_final
- [x] **Multiple `[FILE: ...]` markers in one answer**: the bridge cleanly splits them into multiple file_from_aria events, ARIA does not have to post two answers herself. For non-existent files the user gets a hint instead of a silent skip
- [x] **Inline images in chat messages** (app): ``- and plain `https://image.png` URLs are rendered as an image preview below the text. With `react-native-svg`, SVG URLs render inline too
- [x] **SVG attachments** render correctly: the ChatImage component detects the `.svg` extension and uses SvgUri instead of Image (RN Image cannot do SVG). Same for the fullscreen modal, with `preserveAspectRatio="xMidYMid meet"` so SVGs are not stretched
- [x] **Pinch zoom + pan in the fullscreen modal** (app): new `ZoomableImage` component, a pure RN implementation with PanResponder+Animated, no external lib. Two-finger pinch 1x..5x, one-finger pan when zoomed, double tap toggles 1x↔2.5x. Plus a ✕ close button so tap-to-close does not collide with pan gestures
- [x] **ARIA acronym spelled out**: App → Settings → About and Diagnostic → Settings now explain: "ARIA - Autonomous Reasoning & Intelligence Assistant"
- [x] **`init.sh`** creates missing config files from the *.example templates - a fresh clone starts without instructions
- [x] **`USER.md` private**: taken out of the repo (it contained an internal tool list with the Gitea URL etc.). Template checked in as `USER.md.example`, the local file excluded via `.gitignore`

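A minimal sketch of the message counting behind the auto-compact item above (illustrative; the real bridge additionally clears sessions and restarts the container when the guard fires):

```python
import os

COMPACT_AFTER = int(os.environ.get("COMPACT_AFTER_MESSAGES", "140"))

class CompactGuard:
    """Count user messages and signal when the session should be compacted.

    Long sessions eventually exceed the kernel's execve argument limit
    (E2BIG, roughly 128 KB - 2 MB) when the whole history is passed to the
    spawned CLI as one argument, so sessions are cleared before that point.
    """

    def __init__(self) -> None:
        self.user_messages = 0

    def on_user_message(self) -> bool:
        """Return True when the compaction threshold is reached."""
        self.user_messages += 1
        if self.user_messages >= COMPACT_AFTER:
            self.user_messages = 0
            return True
        return False
```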
### App features

- [x] Image upload works (shared volume /shared/uploads/)
- [x] Voice messages are shown as text (STT → chat bubble)
- [x] Clear cache + auto-download of attachments
- [x] ARIA reads messages aloud (TTS via Piper)
- [x] ARIA reads messages aloud (TTS via Piper, replaced later)
- [x] Autoscroll to the latest message (inverted FlatList)
- [x] Larger images in chat + fullscreen preview
- [x] Ear button → conversation mode (auto-record after ARIA's answer)
- [x] Play button in ARIA messages for voice playback
- [x] Chat search in the app (magnifier in the status bar)
- [x] Watchdog with container restart (2 min warning → 5 min doctor --fix → 8 min restart)
- [x] Cancel button in the Diagnostic chat
- [x] On-the-fly message backup (/shared/config/chat_backup.jsonl)
- [x] Split large messages sentence by sentence for TTS
- [x] RVS messages from the smartphone go through
- [x] Voice settings (Ramona/Thorsten, speed per voice)
- [x] Highlight triggers configurable in Diagnostic
- [x] XTTS v2 integration (gaming PC, GPU, voice cloning)
- [x] Voice settings (Ramona/Thorsten, speed per voice - replaced by XTTS/F5-TTS)
- [x] Highlight triggers configurable in Diagnostic (later removed entirely - a Piper relic)
- [x] XTTS v2 integration (gaming PC, GPU, voice cloning) - replaced by F5-TTS
- [x] XTTS voice cloning (upload audio samples, your own voice)
- [x] TTS engine selectable (Piper/XTTS) in Diagnostic + app
- [x] TTS engine selectable (Piper/XTTS) - Piper out, XTTS out, now F5-TTS only
- [x] Auto-update system (APK via RVS WebSocket)
- [x] Auto-update: APK installation via FileProvider
- [x] Auto-update: "Check for updates" button in the app settings
- [x] Audio queue (sequential playback, no overlap)
- [x] Text messages are answered by ARIA (bridge chat handler fix)
- [x] Multiple attachments + text before sending (pending preview)
- [x] Paste support for images in the Diagnostic chat
- [x] Markdown cleanup for TTS (bold, italic, code, links, etc.)
- [x] SSH volume read-write for the proxy (no more -F workaround)
- [x] Diagnostic: export sessions as Markdown (download button)
- [x] Speech gate: recordings are discarded when no speech is detected (keeps ambient noise away from Whisper)
- [x] Session persistence: the chosen session survives container restarts (sessionFromFile flag, atomic write)
- [x] Diagnostic: "ARIA is thinking..." no longer gets stuck (pipelineEnd always broadcasts idle, even on timeout/error/disconnect)
- [x] App: "ARIA is thinking..." indicator + cancel button (bridge mirrors agent_activity via RVS)
- [x] Whisper STT: model selection in Diagnostic (tiny/base/small/medium/large-v3), hot reload in the bridge, default medium
- [x] Speech gate: recordings are discarded when no speech is detected
- [x] Session persistence: the chosen session survives container restarts
- [x] Whisper STT: model selection in Diagnostic (tiny/base/small/medium/large-v3), hot reload
- [x] App: audio recording explicitly 16 kHz mono (saves a resample, optimal for Whisper)
- [x] Streaming TTS: PCM stream → AudioTrack MODE_STREAM, no WAV gaps
- [x] Piper removed entirely
- [x] Conversation mode: stricter speech gate (-28 dB / 500 ms)
- [x] Diagnostic: archived session versions (.reset.*) shown + exportable
- [x] tools/export-jsonl-to-md.js: CLI converter from session JSONL to Markdown (removed along with OpenClaw)
- [x] NO_REPLY filter in bridge + Diagnostic
- [x] Audio ducking + exclusive focus (Kotlin AudioFocusModule)
- [x] Server-side TTS cleanup: code blocks removed, units spelled out, abbreviations spelled letter by letter, URLs become "a link"
- [x] QR code onboarding: Diagnostic generates the QR, the app scans it
- [x] TTS audio cache in the filesystem: one WAV per messageId, the play button plays from the cache
- [x] Config via Diagnostic: RVS credentials + auth token persisted in /shared/config/runtime.json
- [x] Disk-full banner in Diagnostic: red overlay + copyable cleanup commands (safe + aggressive)
- [x] cleanup.sh: combined Docker cleanup command (safe / --full)
- [x] Streaming TTS pre-roll: AudioTrack play() only starts once 2.5 s are buffered
- [x] Leading silence (200 ms) at stream start - the AudioTrack spins up cleanly
- [x] Pre-roll buffer adjustable in app settings (1.0-6.0 s, default 3.5 s)
- [x] Fade-in on the first PCM chunk (120 ms) - hides XTTS/F5-TTS warmup glitches
- [x] Decimals-to-words for TTS (0.1 → "null komma eins", with an IP-protection lookahead)
- [x] Generic acronym spelling (XTTS → X T T S, USB → U S B, on top of an explicit list)
- [x] voice_preload/voice_ready: silent mini-render on voice switch + toast/status "ready"
- [x] Whisper STT offloaded to the Gamebox (faster-whisper CUDA, float16) - new aria-whisper-bridge container
- [x] aria-bridge: STT primarily remote (Gamebox), local fallback after a 45 s timeout
- [x] Whisper model hot-swap on the Gamebox via config broadcast from Diagnostic
- [x] **F5-TTS replaces XTTS entirely** - new aria-f5tts-bridge container, voice cloning, sentence-wise streaming
- [x] Voice upload with Whisper auto-transcription - the user does not have to type a reference text
- [x] Audio pause instead of ducking: Spotify/YouTube pause entirely during TTS (TRANSIENT instead of MAY_DUCK)
- [x] VAD silence adjustable in app settings (1.0-8.0 s, default 2.8 s)
- [x] MAX_RECORDING raised to 120 s - longer explanations possible
- [x] F5-TTS: reference-WAV preprocessing - loudness normalization to -16 LUFS + silence trim + 10 s clip for consistent cloning quality
- [x] F5-TTS: German fine-tune (aihpi/F5-TTS-German, Vocos variant) configurable via an hf:// path in Diagnostic
- [x] Dynamic STT timeout in aria-bridge: 300 s while the whisper-bridge is 'loading', 45 s when 'ready'
- [x] service_status broadcasts: f5tts/whisper report their loading status, banner in Diagnostic (bottom right) + app (top)
- [x] config_request pattern: bridges ask for the current voice config on connect, aria-bridge answers
- [x] F5-TTS tuning via Diagnostic (model ID, checkpoint, cfg_strength, nfe_step) instead of env vars - hot reload on model switch
- [x] Conversation window: conversation mode ends after X seconds of silence (1.0-20.0 s, default 8 s, adjustable in settings)
- [x] Porcupine wake-word integration in the app (replaced by openWakeWord)
- [x] HF cache as a bind mount instead of a Docker volume - no .vhdx bloat on Docker Desktop / Windows
- [x] cleanup-windows.ps1 / .bat: VHDX cleanup via diskpart (no Hyper-V) with self-elevation
- [x] App text rendering: messages selectable + autolink for URLs/emails/phone numbers (browser/mail/dialer)
- [x] TTS playback speed adjustable per device (settings → 0.5-2.0x in 0.1 steps, default 1.0)
- [x] Diagnostic: voice preview modal (play icon before the delete X, text field with a default, play the WAV in the browser)
- [x] **Wake word fully on-device via openWakeWord (ONNX Runtime)** - Porcupine out, no more API key / license fees. Bundled keywords: hey_jarvis, computer, alexa, hey_mycroft, hey_rhasspy
- [x] APK ABI split to arm64-v8a - from ~136 MB down to ~35 MB, much smaller auto-update downloads to the phone
- [x] PhoneStateListener: TTS pauses on an incoming call (READ_PHONE_STATE permission)
- [x] **VoIP calls** (WhatsApp/Signal/Discord/Teams) detected via an AudioFocus-loss listener + getMode polling fallback (every 3 s)
- [x] **Auto-resume after a call**: ARIA's interrupted answer continues from the recorded position after hang-up (Date.now() tracking + WAV cache, 30 s wait for the final marker on short calls)
- [x] **New question during a call** overrides the pending auto-resume - the latest answer wins, the old resumeSound is stopped
- [x] **Audio output during an active call** works (haltAllPlayback only on the state change idle→ringing/offhook, not on offhook→offhook)
- [x] **PcmPlaybackFinished event** in native: AudioFocus is only released once the AudioTrack is really done (before: end() cap after 0.5 s → Spotify played 32 s in parallel with ARIA)
- [x] **APK cache cleanup more robust**: now searches CachesDirectoryPath + DocumentDirectoryPath + ExternalCachesDirectoryPath + ExternalDirectoryPath instead of only Caches. Plus a manual "Clear update cache" button in Settings → Storage with a live display of the current size
- [x] Diagnostic chat: bubble formatting, multi-line input field (textarea, Enter sends, Shift+Enter inserts a newline)
- [x] Adaptive VAD threshold: baseline from the first 500 ms of mic level, silence = baseline+6 dB / speech = baseline+12 dB
- [x] Max recording duration configurable in settings (1-30 min, default 5 min) - longer dictations possible
- [x] Barge-in: the user can interrupt ARIA during an answer/tool use, the old activity is aborted, the bridge gives aria-core a context hint that it is a correction
- [x] Settings sub-screens: 8 categories (Connection, General, Voice input, Wake word, Voice output, Storage, Log, About) instead of one long list
- [x] **Ready sound (airplane ding-dong) when the mic opens after the wake word** - acoustic confirmation instead of just a toast. Toggle in Settings → Wake word, active by default
- [x] **Wake word in parallel with TTS** with AcousticEchoCanceler: the user says "Computer" while ARIA is speaking → TTS goes silent immediately and a new recording starts
|
||||
- [x] **GPS-Position mitsenden**: Toggle in Settings → Allgemein → Standort, persistiert in AsyncStorage. Wenn aktiv wird lat/lon mit jeder chat/audio-Message mitgegeben. Bridge prefixed den Text fuer aria-core mit GPS-Hint (mit Anweisung dass die Position nur bei Bedarf erwaehnt wird)
|
||||
- [x] **Background Audio Service**: TTS, Wake-Word-Lauschen UND Aufnahme laufen auch bei minimierter App weiter. Foreground-Service mit foregroundServiceType=mediaPlayback|microphone, persistente Notification mit dynamischem Text ("ARIA spricht" / "ARIA hoert zu" / "ARIA bereit")
|
||||
|
||||
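A minimal sketch of the adaptive VAD threshold above, assuming per-frame RMS levels in dBFS; the class and method names are illustrative, not the app's actual module:

```ts
// Sketch of the adaptive VAD threshold (illustrative names, not the
// shipped module). Calibrates a noise baseline from the first 500 ms
// of frames, then classifies frames relative to that baseline.
type FrameClass = "silence" | "uncertain" | "speech";

class AdaptiveVad {
  private baselineDb: number | null = null;
  private calibration: number[] = [];

  // Feed one frame's RMS level in dBFS (e.g. every 20 ms).
  classifyFrame(levelDb: number, elapsedMs: number): FrameClass {
    if (this.baselineDb === null) {
      this.calibration.push(levelDb);
      if (elapsedMs < 500) return "uncertain"; // still calibrating
      // Baseline = mean mic level over the first 500 ms.
      this.baselineDb =
        this.calibration.reduce((a, b) => a + b, 0) / this.calibration.length;
    }
    if (levelDb >= this.baselineDb + 12) return "speech";  // baseline+12 dB
    if (levelDb <= this.baselineDb + 6) return "silence";  // baseline+6 dB
    return "uncertain"; // hysteresis band between the two thresholds
  }
}
```

The two thresholds form a hysteresis band, so a frame sitting between baseline+6 dB and baseline+12 dB neither opens nor closes the gate on its own.
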
### Infrastructure

- [x] Watchdog with container restart (2 min warning → 5 min doctor --fix → 8 min restart) — see the sketch below
- [x] On-the-fly message backup (/shared/config/chat_backup.jsonl)
- [x] RVS messages from the smartphone get through
- [x] SSH volume read-write for the proxy (no more -F workaround)

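A minimal sketch of the escalating watchdog above (since removed again in Phase B), assuming a periodic health probe; `warn`, `doctorFix`, and `restartContainer` are hypothetical hooks, not the real script:

```ts
// Escalating watchdog sketch: warn at 2 min, try "doctor --fix" at
// 5 min, restart the container at 8 min of continuous unhealthiness.
// Each action fires once per unhealthy episode.
async function watchdog(
  isHealthy: () => Promise<boolean>,
  actions: { warn(): void; doctorFix(): void; restartContainer(): void },
) {
  let unhealthySince: number | null = null;
  const fired = new Set<string>();

  setInterval(async () => {
    if (await isHealthy()) {
      unhealthySince = null; // healthy again: reset the episode
      fired.clear();
      return;
    }
    unhealthySince ??= Date.now();
    const mins = (Date.now() - unhealthySince) / 60_000;
    if (mins >= 2 && !fired.has("warn")) { fired.add("warn"); actions.warn(); }
    if (mins >= 5 && !fired.has("fix")) { fired.add("fix"); actions.doctorFix(); }
    if (mins >= 8 && !fired.has("restart")) { fired.add("restart"); actions.restartContainer(); }
  }, 15_000); // probe every 15 s
}
```
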
## Open

### Bugs (priority)

- [ ] App: audio output occasionally just stops (mid-sentence or between chunks)

### Brain (Phase B — the big refactor is underway)

- [x] aria-brain container skeleton (FastAPI + Qdrant + sentence-transformers)
- [x] Memory CRUD via the Diagnostic brain tab (add/edit/delete + search + filter)
- [x] Brain export/import as tar.gz (complete: memories + skills + Qdrant)
- [x] Voice bridge: aria-core-specific logic removed (doctor_fix, aria_restart, aria_session_reset, compact_after)
- [x] aria-core removed entirely from docker-compose.yml, watchdog removed
- [x] Diagnostic: wipe-all button (memory + voices + settings)
- [x] Voice export/import (Diagnostic + XTTS bridge on the gaming PC)
- [x] F5/Whisper settings as a JSON-bundle export/import
- [x] File manager (Diagnostic + app modal): manage /shared/uploads/, deletes are reflected live in the chat bubbles
- [ ] **Phase B item 2:** migrate `aria-data/brain-import/` → atomic memory points (Identity / Rules / Preferences / Tools)
- [ ] **Phase B item 3:** brain conversation loop (single-chat UI + rolling window + memory distillate)
- [ ] **Phase B item 4:** skills system (manifest, venv/local-bin, README per skill, Diagnostic skills tab, export/import)

### App features

- [ ] Wake word on-device (Porcupine "ARIA" keyword, phase 2 — passive listening)
- [ ] Load chat history more reliably (AsyncStorage race condition)
- [ ] Background audio service (TTS even with the app minimized)

### TTS / Audio

- [ ] XTTS audio streaming (PCM stream instead of WAV files, eliminates stuttering completely)
- [ ] Audio normalization (even out loudness between chunks — see the sketch after this list)
- [ ] Piper voice download via Diagnostic (new languages/voices)
- [ ] Custom wake-word upload via Diagnostic (own .onnx files without an app rebuild)

### Architecture

- [ ] Images: use Claude Vision directly (currently only the file path is passed to ARIA)
- [ ] Auto-compacting and memory/brain management (SQLite?)
- [ ] Diagnostic: system-info tab (container status, disk, RAM, CPU)
- [ ] Solve RVS zombie connections for good
- [ ] Gamebox: a small web UI for credentials/server config, or pushed centrally from Diagnostic via RVS

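A minimal sketch of what the planned per-chunk loudness alignment could look like, assuming PCM chunks arrive as Int16Array; the target level, gain cap, and function name are illustrative:

```ts
// Per-chunk RMS normalization sketch (a possible approach, not the
// shipped implementation): measure each chunk's RMS and scale it
// toward a shared target so consecutive chunks don't jump in loudness.
const TARGET_RMS = 0.1; // ~-20 dBFS, assumed target

function normalizeChunk(pcm: Int16Array): Int16Array {
  let sumSq = 0;
  for (let i = 0; i < pcm.length; i++) {
    const s = pcm[i] / 32768;
    sumSq += s * s;
  }
  const rms = Math.sqrt(sumSq / pcm.length);
  if (rms < 1e-4) return pcm; // near-silence: leave untouched
  // Cap the gain so quiet breaths aren't blown up into noise.
  const gain = Math.min(TARGET_RMS / rms, 4);
  const out = new Int16Array(pcm.length);
  for (let i = 0; i < pcm.length; i++) {
    out[i] = Math.max(-32768, Math.min(32767, Math.round(pcm[i] * gain)));
  }
  return out;
}
```
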
+17
-1
@@ -17,6 +17,19 @@ const ALLOWED_TYPES = new Set([
  "xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list", "voice_upload", "xtts_voice_saved",
  "update_check", "update_available", "update_download", "update_data",
  "agent_activity", "cancel_request",
  "audio_pcm",
  "file_from_aria",
  "container_restart",
  "file_list_request", "file_list_response",
  "file_delete_request", "file_deleted",
  "xtts_export_voice", "xtts_voice_exported",
  "xtts_import_voice", "xtts_voice_imported",
  "skill_created",
  "xtts_delete_voice",
  "voice_preload", "voice_ready",
  "stt_request", "stt_response",
  "service_status",
  "config_request",
]);

// Token room: token -> { clients: Set<ws> }

@@ -49,7 +62,10 @@ function cleanupRooms() {

// ── Start the WebSocket server ──────────────────────────────────────

const wss = new WebSocketServer({ port: PORT });
// maxPayload 50MB: TTS streaming + voice upload (WAV as base64) +
// audio_pcm chunks can exceed the ws library's default of 1MB.
// The default limit was the killer for the voice_upload pipeline.
const wss = new WebSocketServer({ port: PORT, maxPayload: 50 * 1024 * 1024 });
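// Sizing check (illustrative numbers): a 60 s voice sample at 24 kHz,
// 16-bit mono is 24000 * 2 * 60 = 2.88 MB raw and ~3.84 MB as base64
// (4/3 overhead), which is already well past the 1 MB ws default.
// A voice_upload carrying several samples in one message lands in the
// tens of MB, which is what the 50 MB ceiling is sized for.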

wss.on("listening", () => {
  log(`RVS läuft auf Port ${PORT} | Max Sessions: ${MAX_SESSIONS}`);

@@ -0,0 +1,9 @@
# HuggingFace model cache (Whisper + F5-TTS, shared between the two
# bridges via a bind mount, can grow to several GB)
hf-cache/

# Voice samples (local, do not belong in the repo)
voices/

# Docker .env
.env
@@ -1,5 +0,0 @@
FROM node:22-alpine
WORKDIR /app
COPY bridge.js package.json ./
RUN npm install --production
CMD ["node", "bridge.js"]
-312
@@ -1,312 +0,0 @@
/**
 * ARIA XTTS Bridge — connects the XTTS v2 server to the RVS
 *
 * Receives tts_request via RVS → renders audio via the XTTS API → sends it back
 * Receives voice_upload → stores the voice sample for cloning
 * Receives xtts_list_voices → lists the available voices
 */

const WebSocket = require("ws");
const http = require("http");
const https = require("https");
const fs = require("fs");
const path = require("path");

const XTTS_API_URL = process.env.XTTS_API_URL || "http://xtts:8000";
const RVS_HOST = process.env.RVS_HOST || "";
const RVS_PORT = process.env.RVS_PORT || "443";
const RVS_TLS = process.env.RVS_TLS || "true";
const RVS_TLS_FALLBACK = process.env.RVS_TLS_FALLBACK || "true";
const RVS_TOKEN = process.env.RVS_TOKEN || "";
const VOICES_DIR = "/voices";

function log(msg) {
  console.log(`[${new Date().toISOString()}] ${msg}`);
}

// ── RVS connection ──────────────────────────────────

let rvsWs = null;
let retryDelay = 2;

function connectRVS(forcePlain) {
  if (!RVS_HOST || !RVS_TOKEN) {
    log("RVS nicht konfiguriert — beende");
    process.exit(1);
  }

  const useTls = RVS_TLS === "true" && !forcePlain;
  const proto = useTls ? "wss" : "ws";
  const url = `${proto}://${RVS_HOST}:${RVS_PORT}?token=${RVS_TOKEN}`;

  log(`Verbinde zu RVS: ${proto}://${RVS_HOST}:${RVS_PORT}`);

  const ws = new WebSocket(url);

  ws.on("open", () => {
    log("RVS verbunden — warte auf TTS-Requests");
    rvsWs = ws;
    retryDelay = 2;

    // Keepalive
    setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        ws.ping();
        ws.send(JSON.stringify({ type: "heartbeat", timestamp: Date.now() }));
      }
    }, 25000);
  });

  ws.on("message", async (raw) => {
    try {
      const msg = JSON.parse(raw.toString());

      if (msg.type === "xtts_request") {
        await handleTTSRequest(msg.payload);
      } else if (msg.type === "voice_upload") {
        await handleVoiceUpload(msg.payload);
      } else if (msg.type === "xtts_list_voices") {
        await handleListVoices();
      }
    } catch (err) {
      log(`Fehler: ${err.message}`);
    }
  });

  ws.on("close", () => {
    log("RVS Verbindung geschlossen");
    rvsWs = null;
    setTimeout(() => connectRVS(), Math.min(retryDelay * 1000, 30000));
    retryDelay = Math.min(retryDelay * 2, 30);
  });

  ws.on("error", (err) => {
    log(`RVS Fehler: ${err.message}`);
    if (useTls && RVS_TLS_FALLBACK === "true") {
      log("TLS fehlgeschlagen — Fallback auf ws://");
      ws.removeAllListeners();
      try { ws.close(); } catch (_) {}
      connectRVS(true);
    }
  });
}

// ── TTS request handler ─────────────────────────────

async function handleTTSRequest(payload) {
  const { text, voice, requestId, language } = payload;
  if (!text) return;

  // Strip Markdown + special characters for natural speech
  let cleanText = text
    .replace(/\*\*([^*]+)\*\*/g, "$1") // **bold** → bold
    .replace(/\*([^*]+)\*/g, "$1") // *italic* → italic
    .replace(/`([^`]+)`/g, "$1") // `code` → code
    .replace(/```[\s\S]*?```/g, "") // remove code blocks
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
    .replace(/#{1,6}\s*/g, "") // remove ### headings
    .replace(/>\s*/g, "") // remove > quotes
    .replace(/[-*]\s+/g, "") // remove - list markers
    .replace(/\n{2,}/g, ". ") // multiple newlines → period
    .replace(/\n/g, ", ") // single newline → comma
    .replace(/\s{2,}/g, " ") // collapse repeated spaces
    .replace(/["""„]/g, "") // remove quotation marks
    .replace(/\(\)/g, "") // empty parentheses
    .trim();

  // Split the text into sentences, then merge them into chunks of 2-3 sentences
  // (more context = more consistent voice/loudness, but not too long for the WebSocket)
  const sentences = cleanText.split(/(?<=[.!?])\s+/)
    .map(s => s.trim())
    .filter(s => s.length > 0)
    .map(s => s.replace(/[.]+$/, '')); // strip the trailing period

  const MAX_CHUNK_CHARS = 150; // max ~150 chars per chunk (fast rendering, preloading covers the rest)
  const chunks = [];
  let currentChunk = '';
  for (const sentence of sentences) {
    if (currentChunk && (currentChunk.length + sentence.length + 2) > MAX_CHUNK_CHARS) {
      chunks.push(currentChunk);
      currentChunk = sentence;
    } else {
      currentChunk = currentChunk ? currentChunk + ', ' + sentence : sentence;
    }
  }
  if (currentChunk) chunks.push(currentChunk);
  if (chunks.length === 0) return;

  log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);

  try {
    const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
    const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);

    // Streaming: render a chunk → send it immediately → next chunk
    // The app plays back seamlessly via its preloading queue
    let sentCount = 0;

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      try {
        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);

        if (audioBuffer && audioBuffer.length > 100) {
          log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);

          sendToRVS({
            type: "xtts_response",
            payload: {
              requestId: `${requestId || ""}_${i}`,
              base64: audioBuffer.toString("base64"),
              mimeType: "audio/wav",
              voice: voice || "default",
              engine: "xtts",
              part: i + 1,
              totalParts: chunks.length,
            },
            timestamp: Date.now(),
          });
          sentCount++;
        }
      } catch (chunkErr) {
        log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
      }
    }

    log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
  } catch (err) {
    log(`TTS Fehler: ${err.message}`);
    sendToRVS({
      type: "xtts_response",
      payload: { requestId, error: err.message },
      timestamp: Date.now(),
    });
  }
}

function callXTTSAPI(text, language, speakerWav) {
  return new Promise((resolve, reject) => {
    const body = JSON.stringify({
      text,
      language,
      speaker_wav: speakerWav || "",
    });

    const url = new URL(`${XTTS_API_URL}/tts_to_audio/`);
    const options = {
      hostname: url.hostname,
      port: url.port,
      path: url.pathname,
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(body),
      },
      timeout: 60000,
    };

    const req = http.request(options, (res) => {
      const chunks = [];
      res.on("data", (chunk) => chunks.push(chunk));
      res.on("end", () => {
        if (res.statusCode === 200) {
          resolve(Buffer.concat(chunks));
        } else {
          reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(chunks).toString().slice(0, 200)}`));
        }
      });
    });

    req.on("error", reject);
    req.on("timeout", () => { req.destroy(); reject(new Error("XTTS API Timeout (60s)")); });
    req.write(body);
    req.end();
  });
}

// ── Voice upload handler ────────────────────────────

async function handleVoiceUpload(payload) {
  const { name, samples } = payload;
  if (!name || !samples || !Array.isArray(samples) || samples.length === 0) {
    log("Voice Upload: Ungueltige Daten");
    return;
  }

  log(`Voice Upload: "${name}" (${samples.length} Samples)`);

  try {
    // Concatenate all samples
    const buffers = samples.map(s => Buffer.from(s.base64, "base64"));
    const combined = Buffer.concat(buffers);

    // Store as WAV
    fs.mkdirSync(VOICES_DIR, { recursive: true });
    const filePath = path.join(VOICES_DIR, `${name.replace(/[^a-zA-Z0-9_-]/g, "_")}.wav`);
    fs.writeFileSync(filePath, combined);

    log(`Voice gespeichert: ${filePath} (${(combined.length / 1024).toFixed(0)}KB)`);

    sendToRVS({
      type: "xtts_voice_saved",
      payload: { name, size: combined.length, path: filePath },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Voice Upload Fehler: ${err.message}`);
  }
}

// ── Voice list handler ──────────────────────────────

async function handleListVoices() {
  try {
    const files = fs.existsSync(VOICES_DIR)
      ? fs.readdirSync(VOICES_DIR).filter(f => f.endsWith(".wav"))
      : [];

    const voices = files.map(f => ({
      name: path.basename(f, ".wav"),
      file: f,
      size: fs.statSync(path.join(VOICES_DIR, f)).size,
    }));

    log(`Stimmen: ${voices.length} verfuegbar`);

    sendToRVS({
      type: "xtts_voices_list",
      payload: { voices },
      timestamp: Date.now(),
    });
  } catch (err) {
    log(`Stimmen-Liste Fehler: ${err.message}`);
  }
}

// ── Send to the RVS ─────────────────────────────────

function sendToRVS(msg) {
  if (rvsWs && rvsWs.readyState === WebSocket.OPEN) {
    rvsWs.send(JSON.stringify(msg));
  }
}

// ── Startup ─────────────────────────────────────────

log("ARIA XTTS Bridge startet...");
log(`XTTS API: ${XTTS_API_URL}`);
log(`RVS: ${RVS_HOST}:${RVS_PORT}`);

// Wait until the XTTS API is reachable
function waitForXTTS(callback, attempts) {
  if (attempts <= 0) { log("XTTS API nicht erreichbar — starte trotzdem"); callback(); return; }
  http.get(`${XTTS_API_URL}/docs`, (res) => {
    log(`XTTS API erreichbar (HTTP ${res.statusCode})`);
    callback();
  }).on("error", () => {
    log(`XTTS API noch nicht bereit — warte (${attempts} Versuche uebrig)...`);
    setTimeout(() => waitForXTTS(callback, attempts - 1), 10000); // 10s instead of 5s (the model load takes a while)
  });
}

waitForXTTS(() => connectRVS(), 30); // wait at most 5 min

+55
-26
@@ -1,7 +1,7 @@
# ════════════════════════════════════════════════
# ARIA XTTS v2 — GPU TTS Server
# ARIA Gamebox Stack — GPU F5-TTS + Whisper STT
# Runs on the gaming PC (RTX 3060)
# Connects to the RVS for TTS requests
# Connects to the RVS for TTS/STT requests
# ════════════════════════════════════════════════
#
# Prerequisites:
@@ -10,15 +10,18 @@
# - .env with the RVS connection data
#
# Start: docker compose up -d
# Test:  curl http://localhost:8000/docs
# ════════════════════════════════════════════════

services:

  # ─── XTTS v2 API Server (GPU) ─────────────────
  xtts:
    image: daswer123/xtts-api-server:latest
    container_name: aria-xtts
  # ─── F5-TTS Bridge (GPU) ──────────────────────
  # Replaces the former XTTS stack. Receives xtts_request via RVS,
  # renders via F5-TTS with voice cloning, streams PCM to the app.
  # Voice upload: stores the WAV and has the whisper-bridge transcribe
  # the reference text — the user doesn't have to type anything.
  f5tts-bridge:
    build: ./f5tts
    container_name: aria-f5tts-bridge
    deploy:
      resources:
        reservations:
@@ -26,31 +29,57 @@ services:
          - driver: nvidia
            count: 1
            capabilities: [gpu]
    ports:
      - "8000:8020"
    volumes:
      - xtts-models:/app/xtts_models   # model cache (~2GB)
      - ./voices:/voices               # custom voice samples
      - ./voices:/voices               # WAV + TXT reference
      - ./hf-cache:/root/.cache/huggingface  # HF cache as a bind mount.
                                             # Directly visible in xtts/hf-cache/,
                                             # easy to delete manually, no
                                             # Docker Desktop .vhdx bloat.
                                             # Shared with the whisper-bridge.
    environment:
      - COQUI_TOS_AGREED=1
    restart: unless-stopped

  # ─── XTTS Bridge (connects to the RVS) ────────
  xtts-bridge:
    build: .
    container_name: aria-xtts-bridge
    depends_on:
      - xtts
    volumes:
      - ./voices:/voices  # shared with the XTTS server
    environment:
      - XTTS_API_URL=http://xtts:8020
      # Bootstrap only — all other F5-TTS settings (model, cfg_strength,
      # nfe_step, custom checkpoint) come from Diagnostic via the RVS config.
      - RVS_HOST=${RVS_HOST}
      - RVS_PORT=${RVS_PORT:-443}
      - RVS_TLS=${RVS_TLS:-true}
      - RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
      - RVS_TOKEN=${RVS_TOKEN}
      - F5TTS_DEVICE=${F5TTS_DEVICE:-cuda}
      - VOICES_DIR=/voices
    restart: unless-stopped

volumes:
  xtts-models:
  # ─── Whisper STT (GPU) ────────────────────────
  # Faster-Whisper on the Gamebox instead of on the VM (CPU) —
  # considerably faster. Connects to the RVS via WebSocket itself,
  # accepts stt_request messages from the aria-bridge there, and
  # answers with stt_response. In addition, the f5tts-bridge uses
  # Whisper internally for the reference transcription on voice
  # uploads. Preloads the model on startup; on config broadcasts
  # (Diagnostic → whisperModel) it hot-swaps at runtime.
  whisper-bridge:
    build: ./whisper
    container_name: aria-whisper-bridge
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - RVS_HOST=${RVS_HOST}
      - RVS_PORT=${RVS_PORT:-443}
      - RVS_TLS=${RVS_TLS:-true}
      - RVS_TLS_FALLBACK=${RVS_TLS_FALLBACK:-true}
      - RVS_TOKEN=${RVS_TOKEN}
      - WHISPER_MODEL=${WHISPER_MODEL:-small}
      - WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
      - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
      - WHISPER_LANGUAGE=${WHISPER_LANGUAGE:-de}
    volumes:
      - ./hf-cache:/root/.cache/huggingface  # same cache as the f5tts-bridge —
                                             # a model only needs to be loaded
                                             # once per machine, no re-download
                                             # on container restart.
    restart: unless-stopped

@@ -0,0 +1,21 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip ffmpeg git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# PyTorch CUDA wheels first (otherwise f5-tts pulls in CPU-only Torch)
RUN pip3 install --no-cache-dir torch==2.3.1 torchaudio==2.3.1 \
    --index-url https://download.pytorch.org/whl/cu121

COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY bridge.py .

CMD ["python3", "bridge.py"]
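# Optional sanity check (an assumption, not part of this Dockerfile):
# inside the running container,
#   python3 -c "import torch; print(torch.cuda.is_available())"
# should print True; it prints False if the CPU-only wheel slipped in.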
@@ -0,0 +1,930 @@
#!/usr/bin/env python3
"""
ARIA F5-TTS Bridge — runs on the Gamebox (RTX 3060).

Receives xtts_request via RVS → F5-TTS voice cloning on the GPU → streams
16-bit PCM chunks back to the app as audio_pcm messages.

Voice layout in VOICES_DIR:
    {name}.wav — reference audio (6-10s, 24kHz mono recommended)
    {name}.txt — reference text (UTF-8, what is spoken in the WAV)

On voice_upload we internally send an stt_request to the whisper-bridge and
store the transcription as .txt — the user doesn't have to enter any text.

Env:
    RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
    F5TTS_MODEL    Default: F5TTS_v1_Base
    F5TTS_DEVICE   Default: cuda
    VOICES_DIR     Default: /voices
"""
import asyncio
import base64
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import time
import uuid
from pathlib import Path
from typing import Optional

import numpy as np
import soundfile as sf
import websockets

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("f5tts-bridge")
# Quiet down the HuggingFace + Torch download logs a bit
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

RVS_HOST = os.getenv("RVS_HOST", "").strip()
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()

# F5-TTS configuration
# ─────────────────────────────────────────────────────────────────
# Defaults are hard-coded — deliberately NO ENV vars (except F5TTS_DEVICE,
# which is hardware bootstrap). All settings are overridden at runtime via
# the RVS config broadcast from Diagnostic (fields f5ttsModel,
# f5ttsCkptFile, f5ttsVocabFile, f5ttsCfgStrength, f5ttsNfeStep).
F5TTS_DEVICE = os.getenv("F5TTS_DEVICE", "cuda")  # bootstrap only

DEFAULT_F5TTS_MODEL = "F5TTS_v1_Base"
DEFAULT_F5TTS_CKPT_FILE = ""   # empty = default checkpoint from HF
DEFAULT_F5TTS_VOCAB_FILE = ""  # empty = default vocab of the model
# cfg_strength: how tightly the generator sticks to the reference voice.
# F5-TTS default = 2.0. For non-EN/CN languages (German!) 2.5+ helps,
# so the model doesn't drift into another language.
DEFAULT_F5TTS_CFG_STRENGTH = 2.5
DEFAULT_F5TTS_NFE_STEP = 32

VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))

PCM_CHUNK_BYTES = 8192  # ~170ms @ 24kHz mono s16
TARGET_SR = 24000       # F5-TTS native
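# Chunk timing check: 8192 bytes / 2 bytes per int16 sample = 4096 samples;
# 4096 samples / 24000 Hz ≈ 0.171 s, i.e. the "~170ms" noted above.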
# F5-TTS has a 12s hard limit for reference audio. Longer WAVs are silently
# truncated by the model — but our ref_text stays long and then no longer
# matches the shortened audio (quality suffers, the warmup render takes
# needlessly long). We clip explicitly to 10s + re-transcribe the text so
# the two stay in sync.
REF_MAX_SECONDS = 10.0

# Recognized as an "invalid reference" during a transition phase (old voices
# that were uploaded before the whisper-bridge was online). When detected we
# delete the .txt and fetch the real text afterwards.
_LEGACY_PLACEHOLDER_REF = "Das ist ein Referenz Audio."

# ── Lazy F5-TTS loader ──────────────────────────────────────

_F5TTS_cls = None


def _get_f5tts_cls():
    """Lazy import so the startup logs aren't flooded with Torch warnings."""
    global _F5TTS_cls
    if _F5TTS_cls is None:
        from f5_tts.api import F5TTS as _cls
        _F5TTS_cls = _cls
    return _F5TTS_cls


def _resolve_hf_path(p: str) -> str:
    """If p starts with 'hf://' → download from the HuggingFace Hub and
    return the local path. Otherwise return it unchanged.

    Format:  hf://user/repo/path/to/file.ext
    Example: hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors
    """
    if not p or not p.startswith("hf://"):
        return p
    try:
        from huggingface_hub import hf_hub_download
        rest = p[5:]
        parts = rest.split("/", 2)
        if len(parts) < 3:
            logger.warning("Ungueltiges hf:// Format: %s (erwarte hf://user/repo/path)", p)
            return p
        repo_id = f"{parts[0]}/{parts[1]}"
        filename = parts[2]
        logger.info("HF-Download: %s aus %s", filename, repo_id)
        local = hf_hub_download(repo_id=repo_id, filename=filename)
        logger.info("HF-Download fertig: %s", local)
        return local
    except Exception as e:
        logger.exception("HF-Download fehlgeschlagen fuer %s: %s", p, e)
        return p


class F5Runner:
    """Holds the F5-TTS model. Synthesis runs in an executor (blocking).

    Live settings (model, cfg_strength, nfe_step) are set through
    update_config() from the Diagnostic config broadcast; a model change
    triggers an automatic reload.
    """

    def __init__(self) -> None:
        self.model = None
        self._lock = asyncio.Lock()
        # Current values — start with the hard defaults, overridden by Diagnostic
        self.model_id: str = DEFAULT_F5TTS_MODEL
        self.ckpt_file: str = DEFAULT_F5TTS_CKPT_FILE
        self.vocab_file: str = DEFAULT_F5TTS_VOCAB_FILE
        self.cfg_strength: float = DEFAULT_F5TTS_CFG_STRENGTH
        self.nfe_step: int = DEFAULT_F5TTS_NFE_STEP
        # Last load time for the service_status broadcast
        self.last_load_seconds: float = 0.0
        self._load_started_at: float = 0.0

    def _load_blocking(self) -> None:
        cls = _get_f5tts_cls()
        ckpt_resolved = _resolve_hf_path(self.ckpt_file) if self.ckpt_file else ""
        vocab_resolved = _resolve_hf_path(self.vocab_file) if self.vocab_file else ""
        logger.info("Lade F5-TTS '%s' (device=%s, ckpt=%s)...",
                    self.model_id, F5TTS_DEVICE, ckpt_resolved or "default")
        self._load_started_at = time.time()
        kwargs = {"model": self.model_id, "device": F5TTS_DEVICE}
        if ckpt_resolved:
            kwargs["ckpt_file"] = ckpt_resolved
        if vocab_resolved:
            kwargs["vocab_file"] = vocab_resolved
        self.model = cls(**kwargs)
        elapsed = time.time() - self._load_started_at
        logger.info("F5-TTS geladen in %.1fs (cfg_strength=%.1f, nfe=%d)",
                    elapsed, self.cfg_strength, self.nfe_step)
        # Read from outside (run_loop) to set service_status to 'ready'
        self.last_load_seconds = elapsed

    async def ensure_loaded(self) -> None:
        async with self._lock:
            if self.model is not None:
                return
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, self._load_blocking)

    async def update_config(self, payload: dict) -> None:
        """Reads the f5tts* fields from a config broadcast.
        A model-relevant change triggers a reload.

        Semantics:
          - key missing from payload   → keep the current value
          - key present, non-empty str → take that value
          - key present, empty string  → RESET to the hard default (the user
            cleared the field in Diagnostic and clicked Apply)
        """
        if "f5ttsModel" in payload:
            v = (payload.get("f5ttsModel") or "").strip()
            new_model = v if v else DEFAULT_F5TTS_MODEL
        else:
            new_model = self.model_id

        if "f5ttsCkptFile" in payload:
            v = payload.get("f5ttsCkptFile") or ""
            new_ckpt = v.strip() if isinstance(v, str) else ""
        else:
            new_ckpt = self.ckpt_file

        if "f5ttsVocabFile" in payload:
            v = payload.get("f5ttsVocabFile") or ""
            new_vocab = v.strip() if isinstance(v, str) else ""
        else:
            new_vocab = self.vocab_file
        try:
            new_cfg = float(payload.get("f5ttsCfgStrength", self.cfg_strength))
        except (TypeError, ValueError):
            new_cfg = self.cfg_strength
        try:
            new_nfe = int(payload.get("f5ttsNfeStep", self.nfe_step))
        except (TypeError, ValueError):
            new_nfe = self.nfe_step

        # Settings that need NO model reload (take effect on the next synthesis)
        self.cfg_strength = new_cfg
        self.nfe_step = new_nfe

        # Settings that trigger a reload
        model_changed = (new_model != self.model_id
                         or new_ckpt != self.ckpt_file
                         or new_vocab != self.vocab_file)
        if model_changed:
            logger.info("F5-TTS Config-Wechsel: model=%s ckpt=%s vocab=%s — Reload",
                        new_model, new_ckpt or "default", new_vocab or "default")
            self.model_id = new_model
            self.ckpt_file = new_ckpt
            self.vocab_file = new_vocab
            async with self._lock:
                old = self.model
                self.model = None
                # Release the old instance
                try:
                    if old is not None:
                        del old
                except Exception:
                    pass
                loop = asyncio.get_event_loop()
                await loop.run_in_executor(None, self._load_blocking)
        else:
            logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)

    def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str,
                        speed: float = 1.0) -> tuple[np.ndarray, int]:
        logger.info("infer() text=%d chars, speed=%.2f, cfg=%.2f, nfe=%d",
                    len(gen_text), speed, self.cfg_strength, self.nfe_step)
        wav, sr, _ = self.model.infer(
            ref_file=ref_wav,
            ref_text=ref_text,
            gen_text=gen_text,
            remove_silence=True,
            seed=-1,
            cfg_strength=self.cfg_strength,
            nfe_step=self.nfe_step,
            speed=speed,
        )
        # F5-TTS returns a float32 1D array — at the standard 24kHz sample rate
        if not isinstance(wav, np.ndarray):
            wav = np.asarray(wav, dtype=np.float32)
        if wav.ndim > 1:
            wav = wav.squeeze()
        return wav.astype(np.float32), int(sr)

    async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str,
                         speed: float = 1.0) -> tuple[np.ndarray, int]:
        await self.ensure_loaded()
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text, speed)


# ── Helpers ─────────────────────────────────────────────────

_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+|\n+")


def split_sentences(text: str, max_len: int = 350) -> list[str]:
    """Splits long text at sentence boundaries. Short texts pass through as-is."""
    text = text.strip()
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    parts = [p.strip() for p in _SENTENCE_SPLIT.split(text) if p.strip()]
    # Merge fragments that are too short, so F5-TTS doesn't restart at every comma
    merged: list[str] = []
    buf = ""
    for p in parts:
        if len(buf) + len(p) + 1 <= max_len:
            buf = f"{buf} {p}".strip()
        else:
            if buf:
                merged.append(buf)
            buf = p
    if buf:
        merged.append(buf)
    return merged or [text]
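# Example: any text of 350 chars or less passes through as one chunk;
# longer text is split at sentence/newline boundaries and re-merged into
# chunks of at most max_len chars each.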

def float_to_pcm16(wav: np.ndarray) -> bytes:
    """Float32 (-1..+1) → int16 little-endian bytes.

    F5-TTS occasionally produces NaN/Inf on instabilities — without the
    sanitizing step the cast to int16 would be undefined (RuntimeWarning +
    broken sound).
    """
    nan_count = int(np.isnan(wav).sum() + np.isinf(wav).sum())
    if nan_count > 0:
        logger.warning("F5-TTS Output enthaelt %d NaN/Inf samples — ersetze mit 0", nan_count)
        wav = np.nan_to_num(wav, nan=0.0, posinf=1.0, neginf=-1.0)
    wav = np.clip(wav, -1.0, 1.0)
    pcm = (wav * 32767.0).astype(np.int16)
    return pcm.tobytes()
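# Note: scaling by 32767 maps +1.0 → 32767 and -1.0 → -32767 (not -32768),
# deliberately staying inside the int16 range after the clip above.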

def sanitize_voice_name(name: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_-]", "_", name)


def voice_paths(name: str) -> tuple[Path, Path]:
    safe = sanitize_voice_name(name)
    return VOICES_DIR / f"{safe}.wav", VOICES_DIR / f"{safe}.txt"


def normalize_ref_wav(src_wav: Path, max_seconds: float = REF_MAX_SECONDS) -> tuple[Path, bool]:
    """Brings the reference WAV into an F5-TTS-friendly shape:

    * 24kHz mono
    * at most max_seconds long
    * silence trimmed at the start + end (silenceremove filter)
    * loudness normalized to -16 LUFS (loudnorm filter) so the model
      sees consistent amplitudes

    F5-TTS is sensitive to quiet / noisy / choppy references. Consistent,
    clean input loudness helps the quality.

    Returns:
        (path, was_modified) — was_modified=True if the file was actually
        changed (the caller should then invalidate the matching .txt).
    """
    tmp_out = src_wav.with_suffix(".conv.wav")
    # silenceremove at the start: skip until speech above -50dB
    # silenceremove at the end: above -50dB counts as speech, then 0.5s of silence as cutoff
    # loudnorm: EBU R128, target -16 LUFS
    af = ("silenceremove=start_periods=1:start_duration=0.05:start_threshold=-50dB,"
          "silenceremove=stop_periods=1:stop_duration=0.5:stop_threshold=-50dB,"
          "loudnorm=I=-16:TP=-1.5:LRA=11")
    cmd = ["ffmpeg", "-y", "-i", str(src_wav),
           "-af", af,
           "-ar", str(TARGET_SR), "-ac", "1",
           "-t", str(max_seconds),
           "-f", "wav", str(tmp_out)]
    r = subprocess.run(cmd, capture_output=True, timeout=30)
    if r.returncode != 0:
        logger.warning("ffmpeg-Normalisierung von %s fehlgeschlagen: %s",
                       src_wav, r.stderr.decode(errors="replace")[:300])
        try:
            tmp_out.unlink()
        except OSError:
            pass
        return src_wav, False
    os.replace(tmp_out, src_wav)
    try:
        info = sf.info(str(src_wav))
        logger.info("Referenz-WAV normalisiert: %s (%.1fs, %dHz mono, -16 LUFS, silence getrimmt)",
                    src_wav.name, info.duration, info.samplerate)
    except Exception:
        logger.info("Referenz-WAV normalisiert: %s", src_wav.name)
    return src_wav, True


async def _send(ws, mtype: str, payload: dict) -> None:
    try:
        await ws.send(json.dumps({
            "type": mtype,
            "payload": payload,
            "timestamp": int(time.time() * 1000),
        }))
    except Exception as e:
        logger.warning("Send fehlgeschlagen (%s): %s", mtype, e)


# ── Internal transcription via the whisper-bridge ───────────

_pending_stt: dict[str, asyncio.Future] = {}
_STT_TIMEOUT_S = 60.0


async def request_transcription(ws, wav_path: Path, language: str = "de") -> Optional[str]:
    """Sends an stt_request to the whisper-bridge (via RVS) and waits for the stt_response."""
    try:
        with open(wav_path, "rb") as f:
            audio_b64 = base64.b64encode(f.read()).decode("ascii")
    except Exception as e:
        logger.error("Lesen %s fehlgeschlagen: %s", wav_path, e)
        return None

    request_id = str(uuid.uuid4())
    loop = asyncio.get_event_loop()
    fut: asyncio.Future = loop.create_future()
    _pending_stt[request_id] = fut

    try:
        await _send(ws, "stt_request", {
            "requestId": request_id,
            "audio": audio_b64,
            "mimeType": "audio/wav",
            # NO hardcoded model — the whisper-bridge uses whatever is already
            # loaded. Otherwise a swap to 'small' would happen here, and the
            # model configured in Diagnostic (e.g. large-v3) would have to be
            # loaded again afterwards → a double download.
            "language": language,
        })
        return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S)
    except asyncio.TimeoutError:
        logger.warning("Transkription Timeout fuer %s", wav_path.name)
        return None
    except Exception as e:
        logger.warning("Transkription Fehler: %s", e)
        return None
    finally:
        _pending_stt.pop(request_id, None)

# ── TTS request handler ─────────────────────────────────────

# Queue so parallel requests don't overlap (GPU throughput)
_tts_queue: asyncio.Queue[tuple] = asyncio.Queue()


async def _tts_worker(ws, runner: F5Runner) -> None:
    """Serializes syntheses — otherwise the GPU can go OOM."""
    while True:
        text, voice, request_id, message_id, language, speed = await _tts_queue.get()
        try:
            await _do_tts(ws, runner, text, voice, request_id, message_id, language, speed)
        except Exception:
            logger.exception("TTS-Worker Fehler")
        finally:
            _tts_queue.task_done()


async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
                  request_id: str, message_id: str, language: str,
                  speed: float = 1.0) -> None:
    t0 = time.time()
    ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)

    # WAV too long? F5-TTS caps it internally at 12s, and then the txt no
    # longer matches the audio. We clip explicitly to 10s and invalidate the
    # txt so it gets re-transcribed on the fly to match the shortened audio.
    if voice and ref_wav_path and ref_wav_path.exists():
        try:
            info = sf.info(str(ref_wav_path))
            if info.duration > REF_MAX_SECONDS + 0.5:
                logger.info("Voice '%s' WAV ist %.1fs (>%.0fs) → clippen + txt neu",
                            voice, info.duration, REF_MAX_SECONDS)
                _, modified = normalize_ref_wav(ref_wav_path)
                if modified and ref_txt_path and ref_txt_path.exists():
                    ref_txt_path.unlink()
        except Exception as e:
            logger.warning("Konnte WAV-Dauer nicht pruefen: %s", e)

    # Detect the legacy placeholder → treat it as "no txt" and re-transcribe
    if voice and ref_txt_path and ref_txt_path.exists():
        try:
            existing = ref_txt_path.read_text(encoding="utf-8").strip()
            if existing == _LEGACY_PLACEHOLDER_REF or not existing:
                logger.info("Voice '%s' hat Legacy-Platzhalter → loesche, transkribiere neu", voice)
                ref_txt_path.unlink()
        except Exception:
            pass

    has_custom = bool(voice and ref_wav_path and ref_wav_path.exists() and ref_txt_path.exists())
    if voice and not has_custom:
        # Only the WAV exists but no txt → transcribe on the fly
        if ref_wav_path and ref_wav_path.exists() and (not ref_txt_path or not ref_txt_path.exists()):
            logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
            text_ref = await request_transcription(ws, ref_wav_path, language)
            if text_ref and text_ref.strip():
                try:
                    ref_txt_path.write_text(text_ref.strip(), encoding="utf-8")
                    has_custom = True
                    logger.info("Referenz-Text nachgezogen: '%s'", text_ref[:60])
                except Exception as e:
                    logger.warning("Referenz-Text speichern fehlgeschlagen: %s", e)
        if not has_custom:
            logger.warning("Voice '%s' nicht komplett (%s, txt=%s) — nehme Default",
                           voice, ref_wav_path, (ref_txt_path and ref_txt_path.exists()))

    if has_custom:
        ref_wav_str = str(ref_wav_path)
        ref_text = ref_txt_path.read_text(encoding="utf-8").strip()
    else:
        # Fallback: no custom voice. F5-TTS ALWAYS needs a reference; we use
        # default_ref.wav/txt if present, otherwise the first voice found in
        # the folder.
        default_wav = VOICES_DIR / "default_ref.wav"
        default_txt = VOICES_DIR / "default_ref.txt"
        if default_wav.exists() and default_txt.exists():
            ref_wav_str = str(default_wav)
            ref_text = default_txt.read_text(encoding="utf-8").strip()
        else:
            # Take any existing voice pair
            pair = next(
                ((w, t) for w, t in (
                    (v, v.with_suffix(".txt")) for v in VOICES_DIR.glob("*.wav")
                ) if t.exists()),
                None,
            )
            if not pair:
                logger.error("Keine Referenz-Stimme im VOICES_DIR — TTS abgebrochen")
                return
            ref_wav_str, ref_text = str(pair[0]), pair[1].read_text(encoding="utf-8").strip()

    sentences = split_sentences(text)
    logger.info("F5-TTS: %d Satz(e), voice=%s, speed=%.2fx (%s)",
                len(sentences), voice or "default", speed, ref_wav_str)

    chunk_index = 0
    pcm_sr = TARGET_SR
    for i, sent in enumerate(sentences):
        try:
            wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text, speed)
            pcm_sr = sr
            pcm_bytes = float_to_pcm16(wav)
            # The very first PCM chunk of the very first sentence gets a
            # fade-in (masks possible warmup glitches). All other chunks
            # stay as they are.
            if i == 0 and chunk_index == 0:
                pcm_bytes = _fade_in_pcm16(pcm_bytes, sr, 120)

            # Slice it up
            for off in range(0, len(pcm_bytes), PCM_CHUNK_BYTES):
                slice_ = pcm_bytes[off:off + PCM_CHUNK_BYTES]
                await _send(ws, "audio_pcm", {
                    "requestId": request_id,
                    "messageId": message_id,
                    "base64": base64.b64encode(slice_).decode("ascii"),
                    "format": "pcm_s16le",
                    "sampleRate": sr,
                    "channels": 1,
                    "voice": voice or "default",
                    "chunk": chunk_index,
                    "final": False,
                })
                chunk_index += 1
        except Exception as e:
            logger.exception("F5-TTS Synthese-Fehler (Satz %d)", i)
            await _send(ws, "xtts_response", {
                "requestId": request_id,
                "error": str(e)[:200],
            })
            return

    # Final marker
    await _send(ws, "audio_pcm", {
        "requestId": request_id,
        "messageId": message_id,
        "base64": "",
        "format": "pcm_s16le",
        "sampleRate": pcm_sr,
        "channels": 1,
        "voice": voice or "default",
        "chunk": chunk_index,
        "final": True,
    })

    logger.info("TTS komplett: %d Chunks, %.2fs render (voice=%s, text=%d chars)",
                chunk_index, time.time() - t0, voice or "default", len(text))


def _fade_in_pcm16(pcm: bytes, sr: int, fade_ms: int) -> bytes:
    """Linear fade-in over the first fade_ms — masks warmup glitches."""
    arr = np.frombuffer(pcm, dtype=np.int16).copy()
    fade_samples = min(int((fade_ms / 1000.0) * sr), len(arr))
    if fade_samples <= 0:
        return pcm
    ramp = np.linspace(0.0, 1.0, fade_samples, dtype=np.float32)
    arr[:fade_samples] = (arr[:fade_samples].astype(np.float32) * ramp).astype(np.int16)
    return arr.tobytes()

# ── Voice management handlers ───────────────────────────────

async def handle_voice_upload(ws, payload: dict) -> None:
    name = (payload.get("name") or "").strip()
    samples = payload.get("samples") or []
    if not name or not samples:
        logger.warning("voice_upload: ungueltig (name=%r, samples=%d)", name, len(samples))
        return
    logger.info("Voice-Upload: '%s' (%d Samples)", name, len(samples))

    try:
        VOICES_DIR.mkdir(parents=True, exist_ok=True)
        safe = sanitize_voice_name(name)
        wav_path = VOICES_DIR / f"{safe}.wav"
        txt_path = VOICES_DIR / f"{safe}.txt"

        # Concatenate the samples
        buffers = [base64.b64decode(s.get("base64", "")) for s in samples]
        with open(wav_path, "wb") as f:
            for b in buffers:
                f.write(b)
        size_kb = wav_path.stat().st_size / 1024
        logger.info("Voice WAV gespeichert: %s (%.0fKB)", wav_path, size_kb)

        # Convert to 24kHz mono, clipped to 10s (the F5-TTS hard limit is 12s;
        # shorter = faster warmup + text and audio stay aligned)
        normalize_ref_wav(wav_path)

        # Request the transcription from the whisper-bridge
        logger.info("Transkribiere '%s' via whisper-bridge...", name)
        text = await request_transcription(ws, wav_path, language="de")
        if text and text.strip():
            txt_path.write_text(text.strip(), encoding="utf-8")
            logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])
            ref_text_for_response = text.strip()
        else:
            # Do NOT write a placeholder anymore! On the first real TTS use
            # the transcription is fetched on the fly. If the whisper-bridge
            # is online by then, it works — otherwise the user could create
            # the .txt manually.
            logger.warning("Voice '%s': Transkription fehlgeschlagen — .txt bleibt leer, "
                           "wird on-the-fly bei erstem Render nachgezogen", name)
            ref_text_for_response = ""

        await _send(ws, "xtts_voice_saved", {
            "name": name, "size": int(size_kb * 1024), "refText": ref_text_for_response,
        })
        # Refresh the list
        await handle_list_voices(ws)
    except Exception as e:
        logger.exception("voice_upload Fehler")
        await _send(ws, "xtts_voice_saved", {"name": name, "error": str(e)[:200]})


async def handle_list_voices(ws) -> None:
    try:
        voices = []
        if VOICES_DIR.exists():
            for wav in sorted(VOICES_DIR.glob("*.wav")):
                txt = wav.with_suffix(".txt")
                voices.append({
                    "name": wav.stem,
                    "file": wav.name,
                    "size": wav.stat().st_size,
                    "hasRefText": txt.exists(),
                })
        logger.info("Stimmen-Liste: %d", len(voices))
        await _send(ws, "xtts_voices_list", {"voices": voices})
    except Exception:
        logger.exception("handle_list_voices Fehler")


async def handle_delete_voice(ws, payload: dict) -> None:
    name = (payload.get("name") or "").strip()
    if not name:
        return
    try:
        wav, txt = voice_paths(name)
        for p in (wav, txt):
            if p.exists():
                p.unlink()
                logger.info("Voice geloescht: %s", p)
        await handle_list_voices(ws)
    except Exception:
        logger.exception("handle_delete_voice Fehler")


async def handle_export_voice(ws, payload: dict) -> None:
    """Packs a voice (.wav + .txt) as tar.gz and sends it back base64-encoded."""
    name = (payload.get("name") or "").strip()
    if not name:
        await _send(ws, "xtts_voice_exported", {"ok": False, "error": "name fehlt"})
        return
    try:
        wav, txt = voice_paths(name)
        if not wav.exists():
            await _send(ws, "xtts_voice_exported", {"ok": False, "name": name, "error": "Stimme nicht gefunden"})
            return
        import io, tarfile
        buf = io.BytesIO()
        with tarfile.open(fileobj=buf, mode="w:gz") as tar:
            tar.add(wav, arcname=wav.name)
            if txt.exists():
                tar.add(txt, arcname=txt.name)
        data = base64.b64encode(buf.getvalue()).decode("ascii")
        logger.info("Voice exportiert: %s (%d KB tar.gz)", name, len(buf.getvalue()) // 1024)
        await _send(ws, "xtts_voice_exported", {"ok": True, "name": name, "data": data})
    except Exception as e:
        logger.exception("handle_export_voice Fehler")
        await _send(ws, "xtts_voice_exported", {"ok": False, "name": name, "error": str(e)[:200]})


async def handle_import_voice(ws, payload: dict) -> None:
    """Receives a tar.gz with <name>.wav (+ optionally <name>.txt) and puts it
    into VOICES_DIR. Overwrites an existing voice of the same name."""
    name = (payload.get("name") or "").strip()
    data_b64 = payload.get("data") or ""
    if not name or not data_b64:
        await _send(ws, "xtts_voice_imported", {"ok": False, "error": "name/data fehlt"})
        return
    try:
        import io, tarfile
        VOICES_DIR.mkdir(parents=True, exist_ok=True)
        safe = sanitize_voice_name(name)
        data = base64.b64decode(data_b64)
        extracted_wav = False
        with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tar:
            for member in tar.getmembers():
                if not member.isfile():
                    continue
                base = Path(member.name).name  # prevent path traversal
                if base.lower().endswith(".wav"):
                    target = VOICES_DIR / f"{safe}.wav"
                    f = tar.extractfile(member)
                    if f is None:
                        continue
                    with open(target, "wb") as out:
                        out.write(f.read())
                    extracted_wav = True
                elif base.lower().endswith(".txt"):
                    target = VOICES_DIR / f"{safe}.txt"
                    f = tar.extractfile(member)
                    if f is None:
                        continue
                    with open(target, "wb") as out:
                        out.write(f.read())
        if not extracted_wav:
            await _send(ws, "xtts_voice_imported", {"ok": False, "name": name, "error": "Kein .wav im Archiv"})
            return
        logger.info("Voice importiert: %s", name)
        await _send(ws, "xtts_voice_imported", {"ok": True, "name": name})
        await handle_list_voices(ws)
    except Exception as e:
        logger.exception("handle_import_voice Fehler")
        await _send(ws, "xtts_voice_imported", {"ok": False, "name": name, "error": str(e)[:200]})


# Last voice set via Diagnostic (prevents an endless preload on every config)
_last_diag_voice = ""


async def handle_voice_preload(ws, payload: dict, runner: F5Runner) -> None:
    voice = (payload.get("voice") or "").strip()
    request_id = payload.get("requestId", "")
    logger.info("Voice-Preload angefordert: '%s'", voice or "default")

    try:
        ref_wav, ref_txt = voice_paths(voice) if voice else (None, None)
        if voice and (not ref_wav or not ref_wav.exists()):
            await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": "voice-file-not-found"})
            return

        # Make sure the reference text exists (in case only the WAV is there)
        if voice and ref_txt and not ref_txt.exists():
            text = await request_transcription(ws, ref_wav, language="de")
            if text:
                ref_txt.write_text(text.strip(), encoding="utf-8")
                logger.info("Referenz-Text beim Preload nachgezogen")

        # Dummy render as warmup
        t0 = time.time()
        await _do_tts(ws, runner, "ja.", voice, f"preload-{request_id}", "", "de")
        ms = int((time.time() - t0) * 1000)
        await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "loadMs": ms})
    except Exception as e:
        logger.exception("Voice-Preload Fehler")
        await _send(ws, "voice_ready", {"voice": voice, "requestId": request_id, "error": str(e)[:200]})

# ── Haupt-Loop ──────────────────────────────────────────────
|
||||
|
||||
async def _broadcast_status(ws, state: str, **extra) -> None:
|
||||
"""Sendet service_status fuer das F5-TTS Modul.
|
||||
state: 'loading' | 'ready' | 'error'."""
|
||||
payload = {"service": "f5tts", "state": state}
|
||||
payload.update(extra)
|
||||
await _send(ws, "service_status", payload)
|
||||
|
||||
|
||||


async def run_loop(runner: F5Runner) -> None:
    use_tls = RVS_TLS
    retry_s = 2
    tls_fallback_tried = False
    global _last_diag_voice

    while True:
        scheme = "wss" if use_tls else "ws"
        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url

        try:
            logger.info("Connecting to RVS: %s", masked)
            async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
                logger.info("RVS connected")
                retry_s = 2
                tls_fallback_tried = False

                # Status broadcast: first 'loading', then 'ready' after a successful load.
                # Plus: config_request so we receive the persisted diagnostic config
                # in case the aria-bridge does not send it on its own.
                async def _load_with_status():
                    try:
                        if runner.model is not None:
                            logger.info("Initial: broadcasting ready (model already in RAM: %s)", runner.model_id)
                            await _broadcast_status(ws, "ready",
                                                    model=runner.model_id,
                                                    loadSeconds=runner.last_load_seconds)
                        else:
                            logger.info("Initial: broadcasting loading + loading model '%s'", runner.model_id)
                            await _broadcast_status(ws, "loading", model=runner.model_id)
                            await runner.ensure_loaded()
                            await _broadcast_status(ws, "ready",
                                                    model=runner.model_id,
                                                    loadSeconds=runner.last_load_seconds)
                        logger.info("Initial: sending config_request to aria-bridge")
                        await _send(ws, "config_request", {"service": "f5tts"})
                    except Exception as e:
                        logger.exception("Initial load crashed: %s", e)
                        try:
                            await _broadcast_status(ws, "error", error=str(e)[:200])
                        except Exception:
                            pass
                asyncio.create_task(_load_with_status())

                # Start the TTS worker for this connection
                worker = asyncio.create_task(_tts_worker(ws, runner))

                try:
                    async for raw in ws:
                        try:
                            msg = json.loads(raw)
                        except Exception:
                            continue
                        mtype = msg.get("type", "")
                        payload = msg.get("payload", {}) or {}

                        if mtype == "xtts_request":
                            try:
                                speed = float(payload.get("speed") or 1.0)
                            except (TypeError, ValueError):
                                speed = 1.0
                            if not (0.1 <= speed <= 5.0):
                                speed = 1.0
                            await _tts_queue.put((
                                payload.get("text", ""),
                                payload.get("voice", "") or "",
                                payload.get("requestId", ""),
                                payload.get("messageId", ""),
                                payload.get("language", "de"),
                                speed,
                            ))
                        elif mtype == "voice_upload":
                            asyncio.create_task(handle_voice_upload(ws, payload))
                        elif mtype == "xtts_list_voices":
                            asyncio.create_task(handle_list_voices(ws))
                        elif mtype == "xtts_delete_voice":
                            asyncio.create_task(handle_delete_voice(ws, payload))
                        elif mtype == "xtts_export_voice":
                            asyncio.create_task(handle_export_voice(ws, payload))
                        elif mtype == "xtts_import_voice":
                            asyncio.create_task(handle_import_voice(ws, payload))
                        elif mtype == "voice_preload":
                            asyncio.create_task(handle_voice_preload(ws, payload, runner))
                        elif mtype == "stt_response":
                            # Reply to our internal transcription request
                            req_id = payload.get("requestId", "")
                            fut = _pending_stt.get(req_id)
                            if fut and not fut.done():
                                if payload.get("error"):
                                    fut.set_result(None)
                                else:
                                    fut.set_result(payload.get("text") or "")
                        elif mtype == "config":
                            # Update the F5-TTS settings (model, cfg_strength, nfe)
                            async def _update_with_status(p):
                                # Check whether a model swap is pending; if so:
                                # first a 'loading' status, then the update, then 'ready'.
                                old_model = (runner.model_id, runner.ckpt_file, runner.vocab_file)
                                new_model_id = (p.get("f5ttsModel") or runner.model_id,
                                                p.get("f5ttsCkptFile", runner.ckpt_file) or "",
                                                p.get("f5ttsVocabFile", runner.vocab_file) or "")
                                will_reload = old_model != new_model_id
                                if will_reload:
                                    await _broadcast_status(ws, "loading", model=new_model_id[0])
                                try:
                                    await runner.update_config(p)
                                    if will_reload:
                                        await _broadcast_status(ws, "ready",
                                                                model=runner.model_id,
                                                                loadSeconds=runner.last_load_seconds)
                                except Exception as e:
                                    if will_reload:
                                        await _broadcast_status(ws, "error", error=str(e)[:200])
                            asyncio.create_task(_update_with_status(payload))
                            # Voice preload on change
                            v = (payload.get("xttsVoice") or "").strip()
                            if v and v != _last_diag_voice:
                                _last_diag_voice = v
                                asyncio.create_task(handle_voice_preload(
                                    ws, {"voice": v, "source": "diagnostic"}, runner,
                                ))
                            elif not v:
                                _last_diag_voice = ""
                finally:
                    worker.cancel()
                    try:
                        await worker
                    except asyncio.CancelledError:
                        pass
        except Exception as e:
            logger.warning("Connection lost: %s", e)
            if use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
                logger.info("TLS failed, falling back to ws://")
                use_tls = False
                tls_fallback_tried = True
                continue
            await asyncio.sleep(min(retry_s, 30))
            retry_s = min(retry_s * 2, 30)
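

# Minimal xtts_request as consumed by the dispatch loop above (values
# illustrative; a 'speed' outside 0.1..5.0 is clamped back to 1.0):
#   {"type": "xtts_request",
#    "payload": {"text": "Hallo!", "voice": "anna", "requestId": "r7",
#                "messageId": "m3", "language": "de", "speed": 1.0}}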


async def main() -> None:
    if not RVS_HOST:
        logger.error("RVS_HOST is not set, aborting")
        sys.exit(1)
    VOICES_DIR.mkdir(parents=True, exist_ok=True)
    runner = F5Runner()
    await run_loop(runner)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        sys.exit(0)
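

# Local run sketch (host and token are placeholders):
#   RVS_HOST=rvs.example RVS_TOKEN=<secret> python3 bridge.py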
@@ -0,0 +1,5 @@
f5-tts>=1.0.0
websockets>=12.0
numpy>=1.24
soundfile>=0.12
requests>=2.31
@@ -1,8 +0,0 @@
{
  "name": "aria-xtts-bridge",
  "version": "1.0.0",
  "private": true,
  "dependencies": {
    "ws": "^8.16.0"
  }
}
@@ -0,0 +1,14 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 python3-pip ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY bridge.py .

CMD ["python3", "bridge.py"]
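
# Build & run sketch for this bridge image (tag and env values are placeholders):
#   docker build -t aria-bridge .
#   docker run --gpus all -e RVS_HOST=rvs.example -e RVS_TOKEN=<secret> aria-bridge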
@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
ARIA Whisper Bridge, running on the gamebox (RTX 3060).

Receives stt_request via RVS → FFmpeg conversion → faster-whisper on the GPU
→ sends stt_response back to the aria-bridge.

Env:
  RVS_HOST, RVS_PORT, RVS_TLS, RVS_TLS_FALLBACK, RVS_TOKEN
  WHISPER_MODEL         Default: small
  WHISPER_DEVICE        Default: cuda
  WHISPER_COMPUTE_TYPE  Default: float16
  WHISPER_LANGUAGE      Default: de
"""
import asyncio
import base64
import json
import logging
import os
import subprocess
import sys
import tempfile
import time
from typing import Optional

import numpy as np
import websockets
from faster_whisper import WhisperModel

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("whisper-bridge")

RVS_HOST = os.getenv("RVS_HOST", "").strip()
RVS_PORT = int(os.getenv("RVS_PORT", "443"))
RVS_TLS = os.getenv("RVS_TLS", "true").lower() == "true"
RVS_TLS_FALLBACK = os.getenv("RVS_TLS_FALLBACK", "true").lower() == "true"
RVS_TOKEN = os.getenv("RVS_TOKEN", "").strip()

WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "de")

ALLOWED_MODELS = {"tiny", "base", "small", "medium", "large-v3"}
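
# Example environment for a typical deployment (values are placeholders):
#   RVS_HOST=192.168.1.50  RVS_PORT=443  RVS_TLS=true  RVS_TOKEN=<secret>
#   WHISPER_MODEL=small  WHISPER_DEVICE=cuda  WHISPER_COMPUTE_TYPE=float16  WHISPER_LANGUAGE=de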


class WhisperRunner:
    """Holds the Whisper model. Hot-swaps on config changes via ensure_loaded()."""

    def __init__(self) -> None:
        self.model_size: str = WHISPER_MODEL
        self.model: Optional[WhisperModel] = None
        self._lock = asyncio.Lock()

    def _load_blocking(self, size: str) -> None:
        logger.info(
            "Loading Whisper '%s' (device=%s, compute=%s)",
            size, WHISPER_DEVICE, WHISPER_COMPUTE_TYPE,
        )
        t0 = time.time()
        self.model = WhisperModel(
            size, device=WHISPER_DEVICE, compute_type=WHISPER_COMPUTE_TYPE,
        )
        self.model_size = size
        logger.info("Whisper '%s' loaded in %.1fs", size, time.time() - t0)

    async def ensure_loaded(self, desired_size: str) -> None:
        if desired_size not in ALLOWED_MODELS:
            logger.warning("Invalid Whisper model '%s', using %s", desired_size, WHISPER_MODEL)
            desired_size = WHISPER_MODEL
        async with self._lock:
            if self.model is not None and self.model_size == desired_size:
                return
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self._load_blocking, desired_size)

    async def transcribe(self, audio: np.ndarray, language: str) -> tuple[str, float]:
        if self.model is None:
            return "", 0.0

        def _run():
            # faster-whisper returns a lazy segment generator; joining here
            # forces the full decode inside the executor thread
            segments, info = self.model.transcribe(
                audio, language=language, beam_size=5, vad_filter=True,
            )
            text = " ".join(seg.text.strip() for seg in segments)
            return text, info.duration

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run)
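

# Standalone usage sketch (outside the bridge; the silent 1s buffer is only a
# placeholder for real 16 kHz mono float32 audio):
#   runner = WhisperRunner()
#   await runner.ensure_loaded("small")
#   text, duration = await runner.transcribe(np.zeros(16000, dtype=np.float32), "de")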


def ffmpeg_to_float32(audio_b64: str, mime_type: str) -> np.ndarray:
    """Decodes any audio format → 16 kHz mono float32 PCM."""
    if "mp4" in mime_type or "m4a" in mime_type or "aac" in mime_type:
        ext = ".mp4"
    elif "wav" in mime_type:
        ext = ".wav"
    elif "ogg" in mime_type or "opus" in mime_type:
        ext = ".ogg"
    else:
        ext = ".bin"

    in_fh = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
    try:
        in_fh.write(base64.b64decode(audio_b64))
        in_fh.close()
        out_path = in_fh.name + ".raw"
        cmd = ["ffmpeg", "-y", "-i", in_fh.name, "-ar", "16000", "-ac", "1", "-f", "f32le", out_path]
        result = subprocess.run(cmd, capture_output=True, timeout=30)
        if result.returncode != 0:
            logger.error("FFmpeg error: %s", result.stderr.decode(errors="replace")[:300])
            return np.zeros(0, dtype=np.float32)
        try:
            return np.fromfile(out_path, dtype=np.float32)
        finally:
            try:
                os.unlink(out_path)
            except OSError:
                pass
    finally:
        try:
            os.unlink(in_fh.name)
        except OSError:
            pass
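

# The subprocess call above is equivalent to this shell invocation (paths
# illustrative); -f f32le writes raw little-endian float32 samples, which
# np.fromfile() reads back without any header parsing:
#   ffmpeg -y -i input.mp4 -ar 16000 -ac 1 -f f32le output.raw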


async def _send(ws, mtype: str, payload: dict) -> None:
    try:
        await ws.send(json.dumps({
            "type": mtype,
            "payload": payload,
            "timestamp": int(time.time() * 1000),
        }))
    except Exception as e:
        logger.warning("Send failed (%s): %s", mtype, e)


async def handle_stt_request(ws, payload: dict, runner: WhisperRunner) -> None:
    request_id = payload.get("requestId", "")
    audio_b64 = payload.get("audio", "")
    mime_type = payload.get("mimeType", "audio/mp4")
    # Model selection:
    #   payload.model set        → use it (aria-bridge sends it based on config)
    #   else, a model is loaded  → keep the current one (no pointless swap)
    #   else                     → fall back to the ENV default
    model = payload.get("model") or (runner.model_size if runner.model is not None else WHISPER_MODEL)
    language = payload.get("language") or WHISPER_LANGUAGE

    if not audio_b64:
        await _send(ws, "stt_response", {"requestId": request_id, "error": "no-audio"})
        return

    try:
        t_load = time.time()
        # If the model is not loaded yet (race condition: stt_request arrives
        # before config), broadcast loading → ready so the app banner pops up
        needs_load = runner.model is None or runner.model_size != model
        if needs_load:
            await _broadcast_status(ws, "loading", model=model)
        await runner.ensure_loaded(model)
        load_ms = int((time.time() - t_load) * 1000)
        if needs_load:
            await _broadcast_status(ws, "ready",
                                    model=runner.model_size,
                                    loadSeconds=load_ms / 1000.0)

        audio = ffmpeg_to_float32(audio_b64, mime_type)
        if audio.size == 0:
            await _send(ws, "stt_response", {"requestId": request_id, "error": "ffmpeg-failed"})
            return
        duration_s = len(audio) / 16000.0
        logger.info("STT request: %.1fs audio, model=%s, lang=%s", duration_s, runner.model_size, language)

        t_stt = time.time()
        text, detected_duration = await runner.transcribe(audio, language)
        stt_ms = int((time.time() - t_stt) * 1000)

        logger.info("STT result (%dms): '%s'", stt_ms, text[:100])

        await _send(ws, "stt_response", {
            "requestId": request_id,
            "text": text.strip(),
            "durationS": duration_s,
            "sttMs": stt_ms,
            "loadMs": load_ms,
            "model": runner.model_size,
        })
    except Exception as e:
        logger.exception("STT request failed")
        await _send(ws, "stt_response", {
            "requestId": request_id,
            "error": str(e)[:200],
        })
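

# Round-trip as seen from the aria-bridge side (values illustrative):
#   -> {"type": "stt_request",
#       "payload": {"requestId": "r42", "audio": "<base64>", "mimeType": "audio/mp4",
#                   "model": "small", "language": "de"}}
#   <- {"type": "stt_response",
#       "payload": {"requestId": "r42", "text": "hallo welt", "durationS": 2.1,
#                   "sttMs": 380, "loadMs": 0, "model": "small"}}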


async def _broadcast_status(ws, state: str, **extra) -> None:
    """Sends service_status for the Whisper module.
    state: 'loading' | 'ready' | 'error'."""
    payload = {"service": "whisper", "state": state}
    payload.update(extra)
    await _send(ws, "service_status", payload)


async def run_loop(runner: WhisperRunner) -> None:
    use_tls = RVS_TLS
    retry_s = 2
    tls_fallback_tried = False

    while True:
        scheme = "wss" if use_tls else "ws"
        url = f"{scheme}://{RVS_HOST}:{RVS_PORT}/ws?token={RVS_TOKEN}"
        masked = url.replace(RVS_TOKEN, "***") if RVS_TOKEN else url
        try:
            logger.info("Connecting to RVS: %s", masked)
            # max_size 50MB so that large stt_request frames (voice-cloning WAVs
            # as base64 can reach several MB) do not blow the frame limit and
            # kill the connection with a 1009 'message too big'.
            async with websockets.connect(url, ping_interval=20, ping_timeout=10, max_size=50 * 1024 * 1024) as ws:
                logger.info("RVS connected")
                retry_s = 2
                tls_fallback_tried = False

                # Initial status broadcast: overrides a stale "ready" state in the
                # app/diagnostic banner (otherwise the user still assumes everything
                # is fine from before). If the model is already loaded → ready,
                # otherwise loading with the current (default) name.
                # Plus: config_request to the aria-bridge, since we do not know
                # whether it also just reconnected or has been online for a while.
                async def _initial_handshake():
                    try:
                        if runner.model is not None:
                            logger.info("Initial: broadcasting ready (model already in RAM: %s)", runner.model_size)
                            await _broadcast_status(ws, "ready", model=runner.model_size)
                        else:
                            init_model = runner.model_size or WHISPER_MODEL
                            logger.info("Initial: broadcasting loading (model=%s)", init_model)
                            await _broadcast_status(ws, "loading", model=init_model)
                        logger.info("Initial: sending config_request to aria-bridge")
                        await _send(ws, "config_request", {"service": "whisper"})
                    except Exception as e:
                        logger.exception("Initial handshake crashed: %s", e)
                asyncio.create_task(_initial_handshake())

                async for raw in ws:
                    try:
                        msg = json.loads(raw)
                    except Exception:
                        continue
                    mtype = msg.get("type", "")
                    payload = msg.get("payload", {}) or {}

                    if mtype == "stt_request":
                        req_id = payload.get("requestId", "?")
                        audio_len = len(payload.get("audio", ""))
                        # base64 encodes 3 bytes in 4 chars, so ~1365 chars per decoded KB
                        logger.info("stt_request received (id=%s, %dKB audio)",
                                    req_id[:8] if req_id != "?" else "?", audio_len // 1365)
                        asyncio.create_task(handle_stt_request(ws, payload, runner))
                    elif mtype == "config":
                        new_model = payload.get("whisperModel") or WHISPER_MODEL
                        # Load if (a) nothing is loaded yet, or (b) the model changes
                        needs_load = (runner.model is None) or (new_model != runner.model_size)
                        if needs_load:
                            logger.info("Config broadcast: Whisper model -> %s%s",
                                        new_model,
                                        " (initial)" if runner.model is None else " (swap)")

                            async def _swap_with_status(target):
                                await _broadcast_status(ws, "loading", model=target)
                                try:
                                    t0 = time.time()
                                    await runner.ensure_loaded(target)
                                    elapsed = time.time() - t0
                                    await _broadcast_status(ws, "ready",
                                                            model=runner.model_size,
                                                            loadSeconds=elapsed)
                                except Exception as e:
                                    await _broadcast_status(ws, "error", error=str(e)[:200])
                            asyncio.create_task(_swap_with_status(new_model))
                    else:
                        # Debug-log all other message types; helps when diagnosing
                        # whether stt_request makes it through the RVS at all
                        logger.debug("Ignored type: %s", mtype)
        except Exception as e:
            logger.warning("Connection lost: %s", e)
            if use_tls and RVS_TLS_FALLBACK and not tls_fallback_tried:
                logger.info("TLS connection failed, falling back to ws://")
                use_tls = False
                tls_fallback_tried = True
                continue
            await asyncio.sleep(min(retry_s, 30))
            retry_s = min(retry_s * 2, 30)


async def main() -> None:
    if not RVS_HOST:
        logger.error("RVS_HOST is not set, aborting")
        sys.exit(1)
    runner = WhisperRunner()
    await run_loop(runner)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        sys.exit(0)
@@ -0,0 +1,4 @@
faster-whisper==1.0.3
websockets>=12.0
numpy>=1.24
requests>=2.31