Compare commits

..

No commits in common. "0fc11e33c8241912d8066fd00955cf83c46310a3" and "2264f4e3bc8b05c1bb61d2d80856548e41649056" have entirely different histories.

10 changed files with 26 additions and 442 deletions

View File

@ -1,90 +0,0 @@
/**
* MessageText rendert Chat-Text mit Auto-Linkifizierung:
* - http(s)://... → tippbar, oeffnet im Browser
* - mailto: oder plain E-Mail tippbar, oeffnet Mail-App
* - Telefonnummern tippbar, oeffnet Android-Dialer
*
* Text ist durchgaengig markierbar/kopierbar (selectable).
*/
import React from 'react';
import { Text, Linking, TextStyle, StyleProp } from 'react-native';
// Combined matcher: URL | email | phone number. The capture-group order
// drives the segment classification in tokenize() below.
//
// URL:   http:// or https://, up to the first whitespace / quote / angle bracket.
// Email: simple conventional match (not RFC-complete, but good enough for chat).
// Phone: international form (+49..., 0049..., 0176...); may contain spaces,
//        hyphens, slashes or parentheses and needs at least 7 digits overall,
//        which avoids matching trivial numbers (times of day, dates).
const LINK_REGEX =
  // 1: URL | 2: email | 3: phone
  /(https?:\/\/[^\s<>"]+)|([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})|((?:\+|00)\d[\d\s()\-\/]{6,}\d|0\d{2,4}[\s\/\-]?[\d\s\-\/]{5,}\d)/g;
// Shared styling for tappable segments. Typed declaration instead of an
// `as TextStyle` assertion so the compiler actually validates the properties
// (an assertion would silence typos or invalid style keys).
const LINK_STYLE: TextStyle = { color: '#0096FF', textDecorationLine: 'underline' };
/** How a chunk of message text should be rendered and handled on press. */
type SegmentKind = 'text' | 'url' | 'email' | 'phone';

/** One contiguous run of characters produced by tokenize(). */
interface Segment {
  /** Raw substring of the original message. */
  text: string;
  /** 'text' renders plain; the other kinds render as tappable links. */
  kind: SegmentKind;
}
/**
 * Splits raw message text into plain-text and link segments using
 * LINK_REGEX. Plain runs between matches are preserved verbatim, so
 * concatenating all segment texts reproduces the input exactly.
 */
function tokenize(raw: string): Segment[] {
  const segments: Segment[] = [];
  let cursor = 0;
  // LINK_REGEX carries the /g flag and therefore mutable lastIndex state;
  // reset it so a previous (aborted) scan cannot skew this one.
  LINK_REGEX.lastIndex = 0;
  for (let hit = LINK_REGEX.exec(raw); hit !== null; hit = LINK_REGEX.exec(raw)) {
    if (hit.index > cursor) {
      segments.push({ text: raw.slice(cursor, hit.index), kind: 'text' });
    }
    // Exactly one alternative matched; pick its kind from the capture groups.
    const [, url, email, phone] = hit;
    if (url) segments.push({ text: url, kind: 'url' });
    else if (email) segments.push({ text: email, kind: 'email' });
    else if (phone) segments.push({ text: phone, kind: 'phone' });
    cursor = LINK_REGEX.lastIndex;
  }
  if (cursor < raw.length) segments.push({ text: raw.slice(cursor), kind: 'text' });
  return segments;
}
/**
 * Handles a tap on a linkified segment by delegating to the OS:
 * url → browser, email → mail app (mailto:), phone → dialer (tel:).
 *
 * Bug fix: Linking.openURL returns a Promise, so the original bare
 * try/catch could not catch async rejections (e.g. no app registered
 * for the scheme) and left them unhandled. Failures are now swallowed
 * via .catch() — a dead link must never crash the chat screen.
 */
function onPress(seg: Segment): void {
  let target: string | null = null;
  if (seg.kind === 'url') {
    target = seg.text;
  } else if (seg.kind === 'email') {
    target = `mailto:${seg.text}`;
  } else if (seg.kind === 'phone') {
    // The Android dialer expects a tel: URI without spaces/dashes/parens.
    target = `tel:${seg.text.replace(/[\s\-\/()]/g, '')}`;
  }
  if (target) {
    Linking.openURL(target).catch(() => {});
  }
}
interface Props {
  // Raw message text; links inside it are detected and made tappable.
  text: string;
  // Optional style applied to the outer <Text> container.
  style?: StyleProp<TextStyle>;
}
const MessageText: React.FC<Props> = ({ text, style }) => {
const segments = React.useMemo(() => tokenize(text), [text]);
return (
<Text style={style} selectable>
{segments.map((seg, i) => {
if (seg.kind === 'text') {
return <Text key={i}>{seg.text}</Text>;
}
return (
<Text key={i} style={LINK_STYLE} onPress={() => onPress(seg)}>
{seg.text}
</Text>
);
})}
</Text>
);
};
export default MessageText;

View File

@ -29,8 +29,7 @@ import updateService from '../services/updater';
import VoiceButton from '../components/VoiceButton'; import VoiceButton from '../components/VoiceButton';
import FileUpload, { FileData } from '../components/FileUpload'; import FileUpload, { FileData } from '../components/FileUpload';
import CameraUpload, { PhotoData } from '../components/CameraUpload'; import CameraUpload, { PhotoData } from '../components/CameraUpload';
import MessageText from '../components/MessageText'; import { RecordingResult, loadConvWindowMs } from '../services/audio';
import { RecordingResult, loadConvWindowMs, loadTtsSpeed, TTS_SPEED_DEFAULT } from '../services/audio';
import Geolocation from '@react-native-community/geolocation'; import Geolocation from '@react-native-community/geolocation';
// --- Typen --- // --- Typen ---
@ -117,13 +116,6 @@ const ChatScreen: React.FC = () => {
const [ttsMuted, setTtsMuted] = useState(false); const [ttsMuted, setTtsMuted] = useState(false);
// Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default) // Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default)
const localXttsVoiceRef = useRef<string>(''); const localXttsVoiceRef = useRef<string>('');
// Geraetelokale TTS-Wiedergabegeschwindigkeit (speed-Param an F5-TTS)
const ttsSpeedRef = useRef<number>(TTS_SPEED_DEFAULT);
// Spiegelung der TTS-Settings in einer Ref — damit die onMessage-Closure
// (useEffect mit []-deps) IMMER die aktuellen Werte sieht. Ohne Ref
// bliebe canPlay auf dem Mount-Initial-Wert haengen (mute ignoriert,
// oder AsyncStorage-Load nicht beruecksichtigt).
const ttsCanPlayRef = useRef<boolean>(true);
const flatListRef = useRef<FlatList>(null); const flatListRef = useRef<FlatList>(null);
const messageIdCounter = useRef(0); const messageIdCounter = useRef(0);
@ -143,7 +135,6 @@ const ChatScreen: React.FC = () => {
setTtsMuted(muted === 'true'); // default false setTtsMuted(muted === 'true'); // default false
const voice = await AsyncStorage.getItem('aria_xtts_voice'); const voice = await AsyncStorage.getItem('aria_xtts_voice');
localXttsVoiceRef.current = voice || ''; localXttsVoiceRef.current = voice || '';
ttsSpeedRef.current = await loadTtsSpeed();
}; };
loadTtsSettings(); loadTtsSettings();
// Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context) // Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context)
@ -156,12 +147,6 @@ const ChatScreen: React.FC = () => {
wakeWordService.loadFromStorage().catch(() => {}); wakeWordService.loadFromStorage().catch(() => {});
}, []); }, []);
// ttsCanPlayRef live aktuell halten — Closure in onMessage unten liest
// darueber statt direkt ttsDeviceEnabled/ttsMuted (sonst stale).
useEffect(() => {
ttsCanPlayRef.current = ttsDeviceEnabled && !ttsMuted;
}, [ttsDeviceEnabled, ttsMuted]);
const toggleMute = useCallback(() => { const toggleMute = useCallback(() => {
setTtsMuted(prev => { setTtsMuted(prev => {
const next = !prev; const next = !prev;
@ -314,8 +299,7 @@ const ChatScreen: React.FC = () => {
} }
// TTS-Audio abspielen wenn vorhanden — respektiert geraetelokalen Mute/Disable // TTS-Audio abspielen wenn vorhanden — respektiert geraetelokalen Mute/Disable
// WICHTIG: via Ref statt direkt state lesen, sonst ist's stale (Closure-Bug). const canPlay = ttsDeviceEnabled && !ttsMuted;
const canPlay = ttsCanPlayRef.current;
if (message.type === 'audio' && message.payload.base64) { if (message.type === 'audio' && message.payload.base64) {
const b64 = message.payload.base64 as string; const b64 = message.payload.base64 as string;
const refId = (message.payload.messageId as string) || ''; const refId = (message.payload.messageId as string) || '';
@ -455,7 +439,6 @@ const ChatScreen: React.FC = () => {
durationMs: result.durationMs, durationMs: result.durationMs,
mimeType: result.mimeType, mimeType: result.mimeType,
voice: localXttsVoiceRef.current, voice: localXttsVoiceRef.current,
speed: ttsSpeedRef.current,
...(location && { location }), ...(location && { location }),
}); });
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert. // resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
@ -477,12 +460,7 @@ const ChatScreen: React.FC = () => {
// Wake Word Toggle Handler // Wake Word Toggle Handler
const toggleWakeWord = useCallback(async () => { const toggleWakeWord = useCallback(async () => {
if (wakeWordActive) { if (wakeWordActive) {
// Vor Porcupine-Stop: eventuelle laufende Aufnahme abbrechen. Sonst wakeWordService.stop();
// bleibt audioService.recordingState=='recording' haengen und der
// normale Aufnahme-Button wirkt nicht mehr (startRecording lehnt
// ab weil "Aufnahme laeuft bereits").
try { await audioService.stopRecording(); } catch {}
await wakeWordService.stop();
setWakeWordActive(false); setWakeWordActive(false);
} else { } else {
const started = await wakeWordService.start(); const started = await wakeWordService.start();
@ -572,7 +550,6 @@ const ChatScreen: React.FC = () => {
rvs.send('chat', { rvs.send('chat', {
text, text,
voice: localXttsVoiceRef.current, voice: localXttsVoiceRef.current,
speed: ttsSpeedRef.current,
...(location && { location }), ...(location && { location }),
}); });
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]); }, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]);
@ -682,7 +659,6 @@ const ChatScreen: React.FC = () => {
rvs.send('chat', { rvs.send('chat', {
text: messageText, text: messageText,
voice: localXttsVoiceRef.current, voice: localXttsVoiceRef.current,
speed: ttsSpeedRef.current,
...(location && { location }), ...(location && { location }),
}); });
} }
@ -757,10 +733,9 @@ const ChatScreen: React.FC = () => {
))} ))}
{/* Text (nicht anzeigen wenn nur "Anhang empfangen" und ein Bild da ist) */} {/* Text (nicht anzeigen wenn nur "Anhang empfangen" und ein Bild da ist) */}
{!(item.text === 'Anhang empfangen' && item.attachments?.some(a => a.type === 'image' && a.uri)) && ( {!(item.text === 'Anhang empfangen' && item.attachments?.some(a => a.type === 'image' && a.uri)) && (
<MessageText <Text style={[styles.messageText, isUser ? styles.userText : styles.ariaText]}>
text={item.text} {item.text}
style={[styles.messageText, isUser ? styles.userText : styles.ariaText]} </Text>
/>
)} )}
{/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Bridge-TTS mit aktueller Engine */} {/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Bridge-TTS mit aktueller Engine */}
{!isUser && item.text.length > 0 && ( {!isUser && item.text.length > 0 && (
@ -775,7 +750,6 @@ const ChatScreen: React.FC = () => {
rvs.send('tts_request' as any, { rvs.send('tts_request' as any, {
text: item.text, text: item.text,
voice: localXttsVoiceRef.current, voice: localXttsVoiceRef.current,
speed: ttsSpeedRef.current,
messageId: item.messageId || '', messageId: item.messageId || '',
}); });
} }

View File

@ -35,10 +35,6 @@ import {
CONV_WINDOW_MIN_SEC, CONV_WINDOW_MIN_SEC,
CONV_WINDOW_MAX_SEC, CONV_WINDOW_MAX_SEC,
CONV_WINDOW_STORAGE_KEY, CONV_WINDOW_STORAGE_KEY,
TTS_SPEED_DEFAULT,
TTS_SPEED_MIN,
TTS_SPEED_MAX,
TTS_SPEED_STORAGE_KEY,
} from '../services/audio'; } from '../services/audio';
import wakeWordService, { import wakeWordService, {
BUILTIN_KEYWORDS, BUILTIN_KEYWORDS,
@ -102,7 +98,6 @@ const SettingsScreen: React.FC = () => {
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC); const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC); const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC);
const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC); const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC);
const [ttsSpeed, setTtsSpeed] = useState<number>(TTS_SPEED_DEFAULT);
const [wakeAccessKey, setWakeAccessKey] = useState<string>(''); const [wakeAccessKey, setWakeAccessKey] = useState<string>('');
const [wakeAccessKeyVisible, setWakeAccessKeyVisible] = useState(false); const [wakeAccessKeyVisible, setWakeAccessKeyVisible] = useState(false);
const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD); const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD);
@ -158,12 +153,6 @@ const SettingsScreen: React.FC = () => {
} }
} }
}); });
AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY).then(saved => {
if (saved != null) {
const n = parseFloat(saved);
if (isFinite(n) && n >= TTS_SPEED_MIN && n <= TTS_SPEED_MAX) setTtsSpeed(n);
}
});
AsyncStorage.getItem(WAKE_ACCESS_KEY_STORAGE).then(saved => { AsyncStorage.getItem(WAKE_ACCESS_KEY_STORAGE).then(saved => {
if (saved) setWakeAccessKey(saved); if (saved) setWakeAccessKey(saved);
}); });
@ -811,38 +800,6 @@ const SettingsScreen: React.FC = () => {
<Text style={styles.prerollButtonText}>+0.5</Text> <Text style={styles.prerollButtonText}>+0.5</Text>
</TouchableOpacity> </TouchableOpacity>
</View> </View>
<Text style={[styles.toggleLabel, {marginTop: 24}]}>Sprechgeschwindigkeit</Text>
<Text style={styles.toggleHint}>
Wie schnell ARIA spricht. 1.0 = Normal. Niedriger = langsamer, hoeher = schneller.
Wird an F5-TTS als speed-Param uebergeben und pro Geraet gespeichert.
Default: {TTS_SPEED_DEFAULT.toFixed(1)}x.
</Text>
<View style={styles.prerollRow}>
<TouchableOpacity
style={styles.prerollButton}
onPress={() => {
const next = Math.max(TTS_SPEED_MIN, Math.round((ttsSpeed - 0.1) * 10) / 10);
setTtsSpeed(next);
AsyncStorage.setItem(TTS_SPEED_STORAGE_KEY, String(next));
}}
disabled={ttsSpeed <= TTS_SPEED_MIN}
>
<Text style={styles.prerollButtonText}>0.1</Text>
</TouchableOpacity>
<Text style={styles.prerollValue}>{ttsSpeed.toFixed(1)} x</Text>
<TouchableOpacity
style={styles.prerollButton}
onPress={() => {
const next = Math.min(TTS_SPEED_MAX, Math.round((ttsSpeed + 0.1) * 10) / 10);
setTtsSpeed(next);
AsyncStorage.setItem(TTS_SPEED_STORAGE_KEY, String(next));
}}
disabled={ttsSpeed >= TTS_SPEED_MAX}
>
<Text style={styles.prerollButtonText}>+0.1</Text>
</TouchableOpacity>
</View>
</View> </View>
)} )}

View File

@ -92,24 +92,6 @@ export const CONV_WINDOW_MIN_SEC = 3.0;
export const CONV_WINDOW_MAX_SEC = 20.0; export const CONV_WINDOW_MAX_SEC = 20.0;
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec'; export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';
// TTS-Wiedergabegeschwindigkeit — wird pro Geraet gespeichert und an die
// Bridge mitgegeben (speed-Param im F5-TTS infer()). 1.0 = normal.
export const TTS_SPEED_DEFAULT = 1.0;
export const TTS_SPEED_MIN = 0.5;
export const TTS_SPEED_MAX = 2.0;
export const TTS_SPEED_STORAGE_KEY = 'aria_tts_speed';
export async function loadTtsSpeed(): Promise<number> {
try {
const raw = await AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY);
if (raw != null) {
const n = parseFloat(raw);
if (isFinite(n) && n >= TTS_SPEED_MIN && n <= TTS_SPEED_MAX) return n;
}
} catch {}
return TTS_SPEED_DEFAULT;
}
export async function loadConvWindowMs(): Promise<number> { export async function loadConvWindowMs(): Promise<number> {
try { try {
const raw = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY); const raw = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY);

View File

@ -90,32 +90,12 @@ class WakeWordService {
if (this.initInProgress) return this.initInProgress; if (this.initInProgress) return this.initInProgress;
this.initInProgress = (async () => { this.initInProgress = (async () => {
try { try {
const porcupineRN = require('@picovoice/porcupine-react-native'); const { PorcupineManager } = require('@picovoice/porcupine-react-native');
const { PorcupineManager, BuiltInKeywords } = porcupineRN; // Built-In Keyword-Identifier sind lower-case strings im SDK
// Manche Porcupine-Versionen wollen das BuiltInKeywords-Enum (Objekt
// mit keys wie JARVIS, COMPUTER, HEY_GOOGLE), andere akzeptieren
// den String direkt. Mappen mit Fallback auf String:
const enumKey = this.keyword.toUpperCase().replace(/\s+/g, '_');
const kw = (BuiltInKeywords && BuiltInKeywords[enumKey]) || this.keyword;
console.log('[WakeWord] Porcupine init: keyword=%s (resolved=%s)',
this.keyword, typeof kw === 'string' ? kw : '[enum]');
this.porcupine = await PorcupineManager.fromBuiltInKeywords( this.porcupine = await PorcupineManager.fromBuiltInKeywords(
this.accessKey, this.accessKey,
[kw], [this.keyword],
(keywordIndex: number) => { (_keywordIndex: number) => this.onWakeDetected(),
console.log('[WakeWord] Porcupine callback fired (index=%d)', keywordIndex);
this.onWakeDetected().catch(err =>
console.warn('[WakeWord] onWakeDetected crashed:', err));
},
// Error handler (wenn Porcupine im Background-Thread crashed,
// z.B. beim Audio-Engine-Konflikt mit audio-recorder-player)
(error: any) => {
console.warn('[WakeWord] Porcupine runtime error:', error?.message || error);
// Nicht in Loop crashen — state zurueck auf off damit der User
// mit dem Aufnahme-Button wieder normal arbeiten kann
this.setState('off');
this.disposePorcupine().catch(() => {});
},
); );
console.log('[WakeWord] Porcupine init OK (keyword=%s)', this.keyword); console.log('[WakeWord] Porcupine init OK (keyword=%s)', this.keyword);
return true; return true;

View File

@ -541,9 +541,6 @@ class ARIABridge:
# Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt. # Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt.
# So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request). # So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request).
self._next_voice_override: Optional[str] = None self._next_voice_override: Optional[str] = None
# Gleiche Logik fuer die Wiedergabegeschwindigkeit (F5-TTS speed-Param,
# App-Setting aria_tts_speed, 1.0 = normal).
self._next_speed_override: Optional[float] = None
# STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten. # STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten.
# requestId → Future mit dem Text (oder None bei Fehler). # requestId → Future mit dem Text (oder None bei Fehler).
self._pending_stt: dict[str, asyncio.Future] = {} self._pending_stt: dict[str, asyncio.Future] = {}
@ -914,12 +911,6 @@ class ARIABridge:
logger.info("[core] Nutze Voice-Override: %s", self._next_voice_override) logger.info("[core] Nutze Voice-Override: %s", self._next_voice_override)
self._next_voice_override = None self._next_voice_override = None
# Speed ebenfalls aus App-Override nehmen (fallback 1.0)
xtts_speed = self._next_speed_override or 1.0
if self._next_speed_override:
logger.info("[core] Nutze Speed-Override: %.2fx", self._next_speed_override)
self._next_speed_override = None
tts_text = tts_text_preview or text tts_text = tts_text_preview or text
if not tts_text: if not tts_text:
logger.info("[core] TTS-Text leer nach Cleanup — uebersprungen") logger.info("[core] TTS-Text leer nach Cleanup — uebersprungen")
@ -935,7 +926,6 @@ class ARIABridge:
"payload": { "payload": {
"text": tts_text, "text": tts_text,
"voice": xtts_voice, "voice": xtts_voice,
"speed": xtts_speed,
"language": "de", "language": "de",
"requestId": xtts_request_id, "requestId": xtts_request_id,
"messageId": message_id, "messageId": message_id,
@ -1173,13 +1163,6 @@ class ARIABridge:
if voice_override: if voice_override:
self._next_voice_override = voice_override self._next_voice_override = voice_override
logger.info("[rvs] Voice-Override fuer naechste Antwort: %s", voice_override) logger.info("[rvs] Voice-Override fuer naechste Antwort: %s", voice_override)
# Speed-Override (TTS-Wiedergabegeschwindigkeit, pro Geraet)
try:
speed = float(payload.get("speed", 0) or 0)
if 0.25 <= speed <= 4.0:
self._next_speed_override = speed
except (TypeError, ValueError):
pass
if text: if text:
logger.info("[rvs] App-Chat: '%s'", text[:80]) logger.info("[rvs] App-Chat: '%s'", text[:80])
await self.send_to_core(text, source="app") await self.send_to_core(text, source="app")
@ -1232,14 +1215,8 @@ class ARIABridge:
if not text: if not text:
return return
tts_text = clean_text_for_tts(text) or text tts_text = clean_text_for_tts(text) or text
# Voice + Speed aus App-Payload gewinnen, sonst global/default # Voice aus App-Payload gewinnt, sonst global
xtts_voice = payload.get("voice", "") or getattr(self, 'xtts_voice', '') xtts_voice = payload.get("voice", "") or getattr(self, 'xtts_voice', '')
try:
xtts_speed = float(payload.get("speed", 0) or 0)
if not (0.25 <= xtts_speed <= 4.0):
xtts_speed = 1.0
except (TypeError, ValueError):
xtts_speed = 1.0
try: try:
xtts_request_id = str(uuid.uuid4()) xtts_request_id = str(uuid.uuid4())
if message_id: if message_id:
@ -1249,7 +1226,6 @@ class ARIABridge:
"payload": { "payload": {
"text": tts_text, "text": tts_text,
"voice": xtts_voice, "voice": xtts_voice,
"speed": xtts_speed,
"language": "de", "language": "de",
"requestId": xtts_request_id, "requestId": xtts_request_id,
"messageId": message_id, "messageId": message_id,
@ -1448,12 +1424,6 @@ class ARIABridge:
if voice_override: if voice_override:
self._next_voice_override = voice_override self._next_voice_override = voice_override
logger.info("[rvs] Voice-Override (via Audio): %s", voice_override) logger.info("[rvs] Voice-Override (via Audio): %s", voice_override)
try:
speed = float(payload.get("speed", 0) or 0)
if 0.25 <= speed <= 4.0:
self._next_speed_override = speed
except (TypeError, ValueError):
pass
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB", logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
mime_type, duration_ms, len(audio_b64) // 1365) mime_type, duration_ms, len(audio_b64) // 1365)
asyncio.create_task(self._process_app_audio(audio_b64, mime_type)) asyncio.create_task(self._process_app_audio(audio_b64, mime_type))

View File

@ -136,25 +136,6 @@
</div> </div>
</div> </div>
<!-- Voice-Preview Modal -->
<div id="voice-preview-modal" style="display:none;position:fixed;inset:0;z-index:1000;background:rgba(0,0,0,0.7);align-items:center;justify-content:center;">
<div style="background:#1A1A2E;border:1px solid #2A2A3E;border-radius:10px;padding:20px;max-width:560px;width:90%;display:flex;flex-direction:column;gap:12px;">
<div style="display:flex;align-items:center;justify-content:space-between;">
<h3 style="margin:0;color:#fff;">Stimmen-Preview: <span id="voice-preview-name"></span></h3>
<button onclick="closeVoicePreview()" style="background:none;border:none;color:#8888AA;font-size:22px;cursor:pointer;">&times;</button>
</div>
<textarea id="voice-preview-text" rows="4"
style="background:#0D0D1A;border:1px solid #2A2A3E;border-radius:6px;padding:10px;color:#fff;font-size:13px;resize:vertical;"></textarea>
<div style="display:flex;gap:8px;align-items:center;">
<button id="voice-preview-play" onclick="playVoicePreview()" class="btn primary" style="padding:8px 16px;">
▶ Abspielen
</button>
<span id="voice-preview-status" style="color:#8888AA;font-size:11px;flex:1;"></span>
</div>
<audio id="voice-preview-audio" controls style="width:100%;display:none;"></audio>
</div>
</div>
<!-- Disk-Space Warnung (dynamisch gesetzt) --> <!-- Disk-Space Warnung (dynamisch gesetzt) -->
<div id="disk-banner" style="display:none;position:sticky;top:0;z-index:500;padding:10px 14px;border-radius:0;margin:-16px -16px 12px -16px;font-size:13px;"> <div id="disk-banner" style="display:none;position:sticky;top:0;z-index:500;padding:10px 14px;border-radius:0;margin:-16px -16px 12px -16px;font-size:13px;">
<div style="display:flex;align-items:center;gap:10px;flex-wrap:wrap;"> <div style="display:flex;align-items:center;gap:10px;flex-wrap:wrap;">
@ -949,24 +930,6 @@
return; return;
} }
if (msg.type === 'voice_preview_audio') {
const statusEl = document.getElementById('voice-preview-status');
const audio = document.getElementById('voice-preview-audio');
const playBtn = document.getElementById('voice-preview-play');
if (playBtn) playBtn.disabled = false;
if (msg.error) {
if (statusEl) statusEl.textContent = '❌ Fehler: ' + msg.error;
return;
}
if (msg.base64 && audio) {
audio.src = 'data:audio/wav;base64,' + msg.base64;
audio.style.display = 'block';
audio.play().catch(() => {});
if (statusEl) statusEl.textContent = '✅ fertig';
}
return;
}
if (msg.type === 'voice_ready') { if (msg.type === 'voice_ready') {
const v = msg.payload?.voice || ''; const v = msg.payload?.voice || '';
const err = msg.payload?.error; const err = msg.payload?.error;
@ -1616,51 +1579,16 @@
html += '<div style="display:flex;flex-direction:column;gap:4px;">'; html += '<div style="display:flex;flex-direction:column;gap:4px;">';
for (const v of voices) { for (const v of voices) {
const esc = (s) => String(s).replace(/[&<>"']/g, c => ({ "&":"&amp;", "<":"&lt;", ">":"&gt;", '"':"&quot;", "'":"&#39;" }[c])); const esc = (s) => String(s).replace(/[&<>"']/g, c => ({ "&":"&amp;", "<":"&lt;", ">":"&gt;", '"':"&quot;", "'":"&#39;" }[c]));
const jsName = esc(v.name).replace(/'/g, "\\'");
html += `<div style="display:flex;align-items:center;gap:8px;background:#1E1E2E;border-radius:4px;padding:4px 8px;font-size:12px;">` html += `<div style="display:flex;align-items:center;gap:8px;background:#1E1E2E;border-radius:4px;padding:4px 8px;font-size:12px;">`
+ `<span style="flex:1;color:#E0E0F0;">${esc(v.name)}</span>` + `<span style="flex:1;color:#E0E0F0;">${esc(v.name)}</span>`
+ `<span style="color:#555570;font-size:10px;">${(v.size/1024).toFixed(0)}KB</span>` + `<span style="color:#555570;font-size:10px;">${(v.size/1024).toFixed(0)}KB</span>`
+ `<button class="btn secondary" onclick="openVoicePreview('${jsName}')" style="padding:2px 8px;font-size:12px;" title="Stimme anhoeren"></button>` + `<button class="btn secondary" onclick="deleteXttsVoice('${esc(v.name).replace(/'/g, "\\'")}')" style="padding:2px 8px;font-size:10px;color:#FF6B6B;" title="Stimme loeschen">X</button>`
+ `<button class="btn secondary" onclick="deleteXttsVoice('${jsName}')" style="padding:2px 8px;font-size:10px;color:#FF6B6B;" title="Stimme loeschen">X</button>`
+ `</div>`; + `</div>`;
} }
html += '</div>'; html += '</div>';
box.innerHTML = html; box.innerHTML = html;
} }
// ── Voice Preview Modal ─────────────────────────
const VOICE_PREVIEW_DEFAULT = 'Hallo, ich bin ARIA. Das hier ist ein kleiner Test damit du meine Stimme beurteilen kannst.';
let currentPreviewVoice = '';
function openVoicePreview(name) {
currentPreviewVoice = name;
document.getElementById('voice-preview-name').textContent = name;
// Text bei jedem Oeffnen zuruecksetzen
document.getElementById('voice-preview-text').value = VOICE_PREVIEW_DEFAULT;
document.getElementById('voice-preview-status').textContent = '';
const audio = document.getElementById('voice-preview-audio');
audio.style.display = 'none';
audio.src = '';
document.getElementById('voice-preview-modal').style.display = 'flex';
}
function closeVoicePreview() {
document.getElementById('voice-preview-modal').style.display = 'none';
const audio = document.getElementById('voice-preview-audio');
try { audio.pause(); } catch {}
}
function playVoicePreview() {
const text = (document.getElementById('voice-preview-text').value || '').trim();
if (!text) {
document.getElementById('voice-preview-status').textContent = 'Text leer';
return;
}
document.getElementById('voice-preview-status').textContent = '⏳ Rendere...';
document.getElementById('voice-preview-play').disabled = true;
send({ action: 'preview_voice', voice: currentPreviewVoice, text });
}
function deleteXttsVoice(name) { function deleteXttsVoice(name) {
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return; if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
send({ action: 'xtts_delete_voice', name }); send({ action: 'xtts_delete_voice', name });

View File

@ -653,9 +653,6 @@ function connectRVS(forcePlain) {
log("info", "rvs", `service_status ${svc} ${state}${model ? ` (${model})` : ""}`); log("info", "rvs", `service_status ${svc} ${state}${model ? ` (${model})` : ""}`);
} }
broadcast({ type: "service_status", payload: msg.payload }); broadcast({ type: "service_status", payload: msg.payload });
} else if (msg.type === "audio_pcm" && msg.payload && _previewPending.size > 0) {
// PCM-Chunks einer laufenden Voice-Preview — sammeln + WAV bauen
_handlePreviewChunk(msg.payload);
} else { } else {
log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`); log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`);
} }
@ -1468,8 +1465,6 @@ wss.on("connection", (ws) => {
handleSaveTriggers(ws, msg.triggers || []); handleSaveTriggers(ws, msg.triggers || []);
} else if (msg.action === "test_tts") { } else if (msg.action === "test_tts") {
handleTestTTS(ws, msg.text || "Test"); handleTestTTS(ws, msg.text || "Test");
} else if (msg.action === "preview_voice") {
handleVoicePreview(ws, msg.voice || "", msg.text || "Hallo.");
} else if (msg.action === "check_tts") { } else if (msg.action === "check_tts") {
handleCheckTTS(ws); handleCheckTTS(ws);
} else if (msg.action === "check_desktop") { } else if (msg.action === "check_desktop") {
@ -1642,95 +1637,6 @@ async function handleSaveTriggers(clientWs, triggers) {
} }
// ── TTS Diagnose (XTTS) ─────────────────────────────── // ── TTS Diagnose (XTTS) ───────────────────────────────
// ── Voice Preview ────────────────────────────────────────
// Sammelt audio_pcm Chunks einer Preview-Anfrage, baut am Ende eine WAV
// und schickt sie base64-kodiert an den Browser-Client.
//
// Map requestId → { clientWs, chunks: [Buffer], sampleRate, channels }
const _previewPending = new Map();
function _buildWavFromPcm(pcmBuf, sampleRate, channels) {
const bitsPerSample = 16;
const byteRate = sampleRate * channels * bitsPerSample / 8;
const blockAlign = channels * bitsPerSample / 8;
const dataSize = pcmBuf.length;
const header = Buffer.alloc(44);
header.write("RIFF", 0);
header.writeUInt32LE(36 + dataSize, 4);
header.write("WAVE", 8);
header.write("fmt ", 12);
header.writeUInt32LE(16, 16); // subchunk1 size
header.writeUInt16LE(1, 20); // PCM
header.writeUInt16LE(channels, 22);
header.writeUInt32LE(sampleRate, 24);
header.writeUInt32LE(byteRate, 28);
header.writeUInt16LE(blockAlign, 32);
header.writeUInt16LE(bitsPerSample, 34);
header.write("data", 36);
header.writeUInt32LE(dataSize, 40);
return Buffer.concat([header, pcmBuf]);
}
function _handlePreviewChunk(payload) {
const reqId = payload?.requestId || "";
const entry = _previewPending.get(reqId);
if (!entry) return;
if (payload.base64) {
try { entry.chunks.push(Buffer.from(payload.base64, "base64")); } catch {}
}
if (!entry.sampleRate && payload.sampleRate) entry.sampleRate = payload.sampleRate;
if (!entry.channels && payload.channels) entry.channels = payload.channels;
if (payload.final) {
_previewPending.delete(reqId);
try {
const pcm = Buffer.concat(entry.chunks);
const wav = _buildWavFromPcm(pcm, entry.sampleRate || 24000, entry.channels || 1);
const b64 = wav.toString("base64");
if (entry.clientWs && entry.clientWs.readyState === 1) {
entry.clientWs.send(JSON.stringify({
type: "voice_preview_audio",
base64: b64,
size: wav.length,
}));
}
} catch (err) {
if (entry.clientWs && entry.clientWs.readyState === 1) {
entry.clientWs.send(JSON.stringify({
type: "voice_preview_audio",
error: err.message,
}));
}
}
}
}
async function handleVoicePreview(clientWs, voice, text) {
try {
const requestId = crypto.randomUUID();
_previewPending.set(requestId, { clientWs, chunks: [], sampleRate: 0, channels: 0 });
// Timeout safety net
setTimeout(() => {
if (_previewPending.has(requestId)) {
_previewPending.delete(requestId);
if (clientWs && clientWs.readyState === 1) {
clientWs.send(JSON.stringify({
type: "voice_preview_audio",
error: "Timeout (60s) — keine Antwort vom f5tts-bridge",
}));
}
}
}, 60000);
log("info", "server", `Voice-Preview: voice="${voice}" text="${text.slice(0, 60)}"`);
sendToRVS_raw({
type: "xtts_request",
payload: { text, language: "de", requestId, voice, speed: 1.0 },
timestamp: Date.now(),
});
} catch (err) {
clientWs.send(JSON.stringify({ type: "voice_preview_audio", error: err.message }));
}
}
async function handleTestTTS(clientWs, text) { async function handleTestTTS(clientWs, text) {
try { try {
log("info", "server", `TTS-Test via XTTS: "${text}"`); log("info", "server", `TTS-Test via XTTS: "${text}"`);

View File

@ -70,34 +70,22 @@
- [x] VAD-Stille einstellbar in App-Settings (1.0-8.0s, Default 2.8s) - [x] VAD-Stille einstellbar in App-Settings (1.0-8.0s, Default 2.8s)
- [x] MAX_RECORDING auf 120s — laengere Erklaerungen moeglich - [x] MAX_RECORDING auf 120s — laengere Erklaerungen moeglich
- [x] App: Audioausgabe hoert nicht mehr mitten im Satz auf (playbackHeadPosition wait + Stop-Race fix) - [x] App: Audioausgabe hoert nicht mehr mitten im Satz auf (playbackHeadPosition wait + Stop-Race fix)
- [x] F5-TTS: Referenz-WAV-Preprocessing — Loudness-Normalisierung -16 LUFS + Silence-Trim + 10s Clip fuer konsistente Cloning-Quali
- [x] F5-TTS: deutsches Fine-Tune (aihpi/F5-TTS-German, Vocos-Variante) via hf:// Pfad in Diagnostic konfigurierbar
- [x] Whisper transkribiert Voice-Uploads nicht mehr mit hardcoded "small" — aktuelles Modell wird behalten, kein unnoetiger Modell-Swap
- [x] RVS/WebSocket maxPayload 50MB: voice_upload mit WAV als base64 sprengt kein Frame-Limit mehr
- [x] Dynamischer STT-Timeout in aria-bridge: 300s waehrend whisper-bridge 'loading', 45s wenn 'ready'
- [x] service_status Broadcasts: f5tts/whisper melden Lade-Status, Banner in Diagnostic (unten rechts) + App (oben)
- [x] config_request Pattern: Bridges fragen beim Connect die aktuelle Voice-Config an, aria-bridge antwortet
- [x] F5-TTS Tuning via Diagnostic (Modell-ID, Checkpoint, cfg_strength, nfe_step) statt ENV-Vars — Hot-Reload bei Modell-Wechsel
- [x] Conversation-Window: Gespraechsmodus endet nach X Sekunden Stille (1.0-20.0s, Default 8s, einstellbar in Settings)
- [x] Porcupine Wake-Word-Integration in der App (Built-In Keywords + Custom spaeter, per Geraet einstellbar)
- [x] HF-Cache als Bind-Mount statt Docker Volume — kein .vhdx-Bloat auf Docker Desktop / Windows
- [x] cleanup-windows.ps1 / .bat: VHDX-Cleanup via diskpart (ohne Hyper-V) mit Self-Elevation
- [x] App Mute-/Auto-Playback-Bug: Closure-Bug geloest (ttsCanPlayRef live-gespiegelt, nicht mehr stale)
- [x] App Zombie-Recording: Ohr-aus kill laufende Aufnahme damit der Aufnahme-Button weiter funktioniert
- [x] App Text-Rendering: Nachrichten selektierbar + Autolink fuer URLs/E-Mails/Telefonnummern (Browser/Mail/Dialer)
- [x] TTS-Wiedergabegeschwindigkeit pro Geraet einstellbar (Settings → 0.5-2.0x in 0.1-Schritten, Default 1.0)
- [x] Diagnostic: Voice-Preview-Modal (Play-Icon vor Delete-X, Textfeld mit Default, WAV im Browser abspielen)
## Offen ## Offen
### Bugs ### Bugs
- [ ] App: Wake-Word "jarvis" triggert nicht zuverlaessig (Porcupine-Debugging via ADB-Logcat ausstehend) - [ ] NO_REPLY wird als "NO" im Chat angezeigt — sollte still verworfen werden (Token nicht gesaeubert)
- [ ] App: Stuerzt beim Lauschen ab, eventuell bei Nebengeraeuschen (Porcupine + Mic-Race, errorCallback haelt's jetzt zurueck — Dauertest ausstehend)
### App Features ### App Features
- [ ] Wake Word on-device (Porcupine "ARIA" Keyword, Phase 2 — passives Lauschen)
- [ ] Chat-History zuverlaessiger laden (AsyncStorage Race Condition) - [ ] Chat-History zuverlaessiger laden (AsyncStorage Race Condition)
- [ ] Background Audio Service (TTS auch bei minimierter App) - [ ] Background Audio Service (TTS auch bei minimierter App)
### TTS / Audio
- [ ] Audio-Normalisierung (Lautstaerke zwischen Saetzen/Chunks angleichen)
- [ ] F5-TTS: Streaming-Inferenz testen (nativ statt satzweise) wenn ein passendes Backend kommt
- [ ] F5-TTS: Optional Deepspeed-Beschleunigung pruefen
### Architektur ### Architektur
- [ ] Bilder: Claude Vision direkt nutzen (aktuell nur Dateipfad an ARIA) - [ ] Bilder: Claude Vision direkt nutzen (aktuell nur Dateipfad an ARIA)
- [ ] Auto-Compacting und Memory/Brain Verwaltung (SQLite?) - [ ] Auto-Compacting und Memory/Brain Verwaltung (SQLite?)

View File

@ -237,8 +237,7 @@ class F5Runner:
else: else:
logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe) logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str, def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
speed: float = 1.0) -> tuple[np.ndarray, int]:
wav, sr, _ = self.model.infer( wav, sr, _ = self.model.infer(
ref_file=ref_wav, ref_file=ref_wav,
ref_text=ref_text, ref_text=ref_text,
@ -247,7 +246,6 @@ class F5Runner:
seed=-1, seed=-1,
cfg_strength=self.cfg_strength, cfg_strength=self.cfg_strength,
nfe_step=self.nfe_step, nfe_step=self.nfe_step,
speed=speed,
) )
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard # F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
if not isinstance(wav, np.ndarray): if not isinstance(wav, np.ndarray):
@ -256,11 +254,10 @@ class F5Runner:
wav = wav.squeeze() wav = wav.squeeze()
return wav.astype(np.float32), int(sr) return wav.astype(np.float32), int(sr)
async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str, async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
speed: float = 1.0) -> tuple[np.ndarray, int]:
await self.ensure_loaded() await self.ensure_loaded()
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text, speed) return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text)
# ── Helpers ───────────────────────────────────────────────── # ── Helpers ─────────────────────────────────────────────────
@ -424,9 +421,9 @@ _tts_queue: asyncio.Queue[tuple] = asyncio.Queue()
async def _tts_worker(ws, runner: F5Runner) -> None: async def _tts_worker(ws, runner: F5Runner) -> None:
"""Serialisiert Synthesen — GPU kann sonst OOM gehen.""" """Serialisiert Synthesen — GPU kann sonst OOM gehen."""
while True: while True:
text, voice, request_id, message_id, language, speed = await _tts_queue.get() text, voice, request_id, message_id, language = await _tts_queue.get()
try: try:
await _do_tts(ws, runner, text, voice, request_id, message_id, language, speed) await _do_tts(ws, runner, text, voice, request_id, message_id, language)
except Exception: except Exception:
logger.exception("TTS-Worker Fehler") logger.exception("TTS-Worker Fehler")
finally: finally:
@ -434,8 +431,7 @@ async def _tts_worker(ws, runner: F5Runner) -> None:
async def _do_tts(ws, runner: F5Runner, text: str, voice: str, async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
request_id: str, message_id: str, language: str, request_id: str, message_id: str, language: str) -> None:
speed: float = 1.0) -> None:
t0 = time.time() t0 = time.time()
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None) ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
@ -513,7 +509,7 @@ async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
pcm_sr = TARGET_SR pcm_sr = TARGET_SR
for i, sent in enumerate(sentences): for i, sent in enumerate(sentences):
try: try:
wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text, speed) wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
pcm_sr = sr pcm_sr = sr
pcm_bytes = float_to_pcm16(wav) pcm_bytes = float_to_pcm16(wav)
# Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert # Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert
@ -758,19 +754,12 @@ async def run_loop(runner: F5Runner) -> None:
payload = msg.get("payload", {}) or {} payload = msg.get("payload", {}) or {}
if mtype == "xtts_request": if mtype == "xtts_request":
try:
speed = float(payload.get("speed") or 1.0)
except (TypeError, ValueError):
speed = 1.0
if not (0.25 <= speed <= 4.0):
speed = 1.0
await _tts_queue.put(( await _tts_queue.put((
payload.get("text", ""), payload.get("text", ""),
payload.get("voice", "") or "", payload.get("voice", "") or "",
payload.get("requestId", ""), payload.get("requestId", ""),
payload.get("messageId", ""), payload.get("messageId", ""),
payload.get("language", "de"), payload.get("language", "de"),
speed,
)) ))
elif mtype == "voice_upload": elif mtype == "voice_upload":
asyncio.create_task(handle_voice_upload(ws, payload)) asyncio.create_task(handle_voice_upload(ws, payload))