diff --git a/android/src/components/VoiceCloneModal.tsx b/android/src/components/VoiceCloneModal.tsx new file mode 100644 index 0000000..9ede336 --- /dev/null +++ b/android/src/components/VoiceCloneModal.tsx @@ -0,0 +1,362 @@ +/** + * VoiceCloneModal — Eigene Stimme aufnehmen und an XTTS uploaden. + * + * Flow: + * - Modal zeigt Vorlesetext (>30s Lesedauer) + Aufnahme-Button + * - Bei Aufnahme: max 30s, Fortschrittsbalken, Countdown + * - Bei Stop: Name abfragen, dann als voice_upload ueber RVS schicken + * - XTTS-Bridge speichert /voices/.wav, antwortet mit xtts_voice_saved + */ + +import React, { useCallback, useEffect, useRef, useState } from 'react'; +import { + Modal, + View, + Text, + TouchableOpacity, + StyleSheet, + Alert, + ScrollView, + ActivityIndicator, + TextInput, +} from 'react-native'; +import audioService from '../services/audio'; +import rvs from '../services/rvs'; + +interface Props { + visible: boolean; + onClose: () => void; +} + +const SAMPLE_TEXT = `Das ist meine eigene Stimme fuer ARIA. Ich lese jetzt einen laengeren Absatz laut vor, damit das Voice-Cloning eine gute Grundlage hat. Guten Tag, ich heisse Stefan und baue gerade mit grosser Begeisterung an meinem persoenlichen KI-Assistenten. Wir automatisieren Infrastruktur, managen Sessions und spielen mit Sprachsynthese. Die letzten Jahre habe ich viel gelernt, vor allem dass Geduld genauso wichtig ist wie Neugier. Hoert sich das jetzt an wie ich selbst? Wenn alles klappt, spricht ARIA bald mit dieser Stimme.`; + +const MAX_DURATION_MS = 30000; +const TARGET_DURATION_MS = 15000; + +const VoiceCloneModal: React.FC = ({ visible, onClose }) => { + const [recording, setRecording] = useState(false); + const [durationMs, setDurationMs] = useState(0); + const [voiceName, setVoiceName] = useState(''); + const [processing, setProcessing] = useState(false); + const [recordingPath, setRecordingPath] = useState(''); + const timerRef = useRef | null>(null); + const startTimeRef = useRef(0); + + // Zustand zuruecksetzen wenn Modal schliesst/oeffnet + useEffect(() => { + if (!visible) { + setRecording(false); + setDurationMs(0); + setVoiceName(''); + setProcessing(false); + setRecordingPath(''); + if (timerRef.current) clearInterval(timerRef.current); + } + }, [visible]); + + // Cleanup bei Unmount + useEffect(() => { + return () => { + if (timerRef.current) clearInterval(timerRef.current); + if (recording) audioService.stopRecording().catch(() => {}); + }; + }, [recording]); + + const startRecording = useCallback(async () => { + // Frische Aufnahme + setDurationMs(0); + setRecordingPath(''); + const ok = await audioService.startRecording(false); + if (!ok) { + Alert.alert('Fehler', 'Aufnahme konnte nicht gestartet werden (Mikrofon-Berechtigung?)'); + return; + } + setRecording(true); + startTimeRef.current = Date.now(); + timerRef.current = setInterval(async () => { + const elapsed = Date.now() - startTimeRef.current; + setDurationMs(elapsed); + if (elapsed >= MAX_DURATION_MS) { + await stopRecording(); + } + }, 100); + }, []); + + const stopRecording = useCallback(async () => { + if (timerRef.current) { + clearInterval(timerRef.current); + timerRef.current = null; + } + if (!recording) return; + const result = await audioService.stopRecording(); + setRecording(false); + if (!result) { + Alert.alert('Keine Sprache erkannt', 'Versuch es bitte nochmal — sprich bis der Timer mindestens 10 Sekunden anzeigt.'); + setDurationMs(0); + return; + } + // Temp-Datei wurde schon geloescht (stopRecording cleaned up). + // Wir brauchen aber base64 aus result direkt fuers Upload. + // result.base64 ist bereits da. + setRecordingPath(result.base64); + }, [recording]); + + const uploadVoice = useCallback(async () => { + const name = voiceName.trim(); + if (!name) { + Alert.alert('Name fehlt', 'Bitte gib der Stimme einen Namen (nur Buchstaben, Zahlen, _ und -).'); + return; + } + if (!/^[a-zA-Z0-9_-]+$/.test(name)) { + Alert.alert('Ungueltiger Name', 'Nur Buchstaben, Zahlen, _ und - erlaubt.'); + return; + } + if (!recordingPath) { + Alert.alert('Keine Aufnahme', 'Bitte zuerst aufnehmen.'); + return; + } + setProcessing(true); + try { + // voice_upload erwartet samples als Array mit base64 (aus Diagnostic-Format kopiert) + rvs.send('voice_upload' as any, { + name, + samples: [{ base64: recordingPath }], + }); + Alert.alert('Hochgeladen', `Stimme "${name}" wird vom XTTS-Server verarbeitet. Nach ein paar Sekunden in der Liste verfuegbar.`); + onClose(); + } catch (err: any) { + Alert.alert('Fehler', err.message); + } finally { + setProcessing(false); + } + }, [voiceName, recordingPath, onClose]); + + const progress = Math.min(durationMs / MAX_DURATION_MS, 1); + const sec = Math.floor(durationMs / 1000); + const enoughRecorded = durationMs >= TARGET_DURATION_MS; + + return ( + + + + Eigene Stimme aufnehmen + + {'\u2715'} + + + + + + Lies den Text laut und deutlich vor. Maximal 30 Sekunden. Je mehr du sprichst + (ziel: bis zum Ende des Textes, ca. 20-30s), desto besser wird die geklonte + Stimme. + + + + {SAMPLE_TEXT} + + + {/* Timer + Fortschritt */} + + + {sec.toString().padStart(2, '0')} / 30 s + + + + + + + {/* Aufnahme-Button */} + {!recordingPath && ( + + {recording ? '\u25A0' : '\u25CF'} + {recording ? 'Stop' : 'Aufnahme starten'} + + )} + + {/* Nach Aufnahme: Name + Upload */} + {recordingPath && ( + + + Aufnahme ({sec}s) fertig. Vergib einen Namen und lade hoch. + + + + { setRecordingPath(''); setDurationMs(0); }} + > + Nochmal aufnehmen + + + {processing + ? + : Hochladen + } + + + + )} + + {recording && !enoughRecorded && ( + + Bitte weiter lesen — mindestens 15 Sekunden + + )} + + {recording && enoughRecorded && ( + + Genug Audio fuer eine gute Clonung. Du kannst stoppen. + + )} + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + backgroundColor: '#0D0D1A', + }, + header: { + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'space-between', + paddingHorizontal: 16, + paddingTop: 48, + paddingBottom: 16, + borderBottomWidth: 1, + borderBottomColor: '#1E1E2E', + }, + title: { + color: '#FFFFFF', + fontSize: 18, + fontWeight: '700', + }, + closeX: { + color: '#8888AA', + fontSize: 24, + paddingHorizontal: 8, + }, + content: { + flex: 1, + }, + hint: { + color: '#8888AA', + fontSize: 13, + lineHeight: 20, + }, + sampleTextBox: { + marginTop: 12, + padding: 14, + backgroundColor: '#12122A', + borderRadius: 10, + borderWidth: 1, + borderColor: '#1E1E2E', + }, + sampleText: { + color: '#E0E0F0', + fontSize: 15, + lineHeight: 24, + }, + timer: { + color: '#666680', + fontSize: 42, + fontWeight: '700', + fontVariant: ['tabular-nums'], + }, + timerActive: { + color: '#FF3B30', + }, + progressBar: { + marginTop: 8, + width: '100%', + height: 8, + backgroundColor: '#1E1E2E', + borderRadius: 4, + overflow: 'hidden', + }, + progressFill: { + height: '100%', + }, + recordBtn: { + marginTop: 24, + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + gap: 12, + backgroundColor: '#1E1E2E', + borderRadius: 12, + padding: 18, + borderWidth: 2, + borderColor: '#34C759', + }, + recordBtnActive: { + borderColor: '#FF3B30', + backgroundColor: 'rgba(255,59,48,0.15)', + }, + recordIcon: { + color: '#FF3B30', + fontSize: 24, + fontWeight: '700', + }, + recordLabel: { + color: '#FFFFFF', + fontSize: 17, + fontWeight: '600', + }, + nameInput: { + marginTop: 10, + backgroundColor: '#1E1E2E', + borderRadius: 8, + paddingHorizontal: 14, + paddingVertical: 12, + color: '#FFFFFF', + fontSize: 15, + borderWidth: 1, + borderColor: '#2A2A3E', + }, + primaryBtn: { + backgroundColor: '#0096FF', + borderRadius: 10, + padding: 14, + alignItems: 'center', + }, + primaryBtnText: { + color: '#FFFFFF', + fontSize: 15, + fontWeight: '700', + }, + secondaryBtn: { + backgroundColor: '#1E1E2E', + borderRadius: 10, + padding: 14, + alignItems: 'center', + borderWidth: 1, + borderColor: '#2A2A3E', + }, + secondaryBtnText: { + color: '#8888AA', + fontSize: 14, + fontWeight: '600', + }, +}); + +export default VoiceCloneModal; diff --git a/android/src/screens/ChatScreen.tsx b/android/src/screens/ChatScreen.tsx index ed68f87..9bf1240 100644 --- a/android/src/screens/ChatScreen.tsx +++ b/android/src/screens/ChatScreen.tsx @@ -110,6 +110,8 @@ const ChatScreen: React.FC = () => { // Gerätelokale TTS-Config: globaler Toggle (aus Settings) + temporäres Muten (Mund-Button) const [ttsDeviceEnabled, setTtsDeviceEnabled] = useState(true); const [ttsMuted, setTtsMuted] = useState(false); + // Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default) + const localXttsVoiceRef = useRef(''); const flatListRef = useRef(null); const messageIdCounter = useRef(0); @@ -127,6 +129,8 @@ const ChatScreen: React.FC = () => { setTtsDeviceEnabled(enabled !== 'false'); // default true const muted = await AsyncStorage.getItem('aria_tts_muted'); setTtsMuted(muted === 'true'); // default false + const voice = await AsyncStorage.getItem('aria_xtts_voice'); + localXttsVoiceRef.current = voice || ''; }; loadTtsSettings(); // Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context) @@ -386,6 +390,7 @@ const ChatScreen: React.FC = () => { base64: result.base64, durationMs: result.durationMs, mimeType: result.mimeType, + voice: localXttsVoiceRef.current, ...(location && { location }), }); } @@ -488,9 +493,10 @@ const ChatScreen: React.FC = () => { }; setMessages(prev => capMessages([...prev, userMsg])); - // An RVS senden + // An RVS senden — mit geraetelokaler Voice (Bridge nutzt sie fuer die Antwort) rvs.send('chat', { text, + voice: localXttsVoiceRef.current, ...(location && { location }), }); }, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]); @@ -599,6 +605,7 @@ const ChatScreen: React.FC = () => { if (messageText) { rvs.send('chat', { text: messageText, + voice: localXttsVoiceRef.current, ...(location && { location }), }); } @@ -689,7 +696,7 @@ const ChatScreen: React.FC = () => { // wieder mit der Nachricht verknuepft (fuer den naechsten Replay aus Cache) rvs.send('tts_request' as any, { text: item.text, - voice: '', + voice: localXttsVoiceRef.current, messageId: item.messageId || '', }); } diff --git a/android/src/screens/SettingsScreen.tsx b/android/src/screens/SettingsScreen.tsx index ac1a645..1d09252 100644 --- a/android/src/screens/SettingsScreen.tsx +++ b/android/src/screens/SettingsScreen.tsx @@ -22,6 +22,7 @@ import DocumentPicker from 'react-native-document-picker'; import rvs, { ConnectionState, RVSMessage, ConnectionConfig, ConnectionLogEntry } from '../services/rvs'; import ModeSelector from '../components/ModeSelector'; import QRScanner from '../components/QRScanner'; +import VoiceCloneModal from '../components/VoiceCloneModal'; const STORAGE_PATH_KEY = 'aria_attachment_storage_path'; const DEFAULT_STORAGE_PATH = `${RNFS.DocumentDirectoryPath}/chat_attachments`; @@ -73,6 +74,9 @@ const SettingsScreen: React.FC = () => { const [storageSize, setStorageSize] = useState('...'); const [ttsEnabled, setTtsEnabled] = useState(true); const [editingPath, setEditingPath] = useState(false); + const [xttsVoice, setXttsVoice] = useState(''); + const [availableVoices, setAvailableVoices] = useState>([]); + const [voiceCloneVisible, setVoiceCloneVisible] = useState(false); const [tempPath, setTempPath] = useState(''); let logIdCounter = 0; @@ -95,6 +99,11 @@ const SettingsScreen: React.FC = () => { AsyncStorage.getItem('aria_tts_enabled').then(saved => { if (saved !== null) setTtsEnabled(saved === 'true'); }); + AsyncStorage.getItem('aria_xtts_voice').then(saved => { + if (saved) setXttsVoice(saved); + }); + // Voice-Liste vom XTTS-Server holen (via RVS) + rvs.send('xtts_list_voices' as any, {}); }, []); // Speichergroesse berechnen @@ -225,6 +234,22 @@ const SettingsScreen: React.FC = () => { const mode = message.payload.mode as string; if (mode) setCurrentMode(mode); } + + // XTTS-Voice-Liste + if (message.type === ('xtts_voices_list' as any)) { + const voices = ((message.payload as any).voices || []) as Array<{name: string, size: number}>; + setAvailableVoices(voices); + } + + // Voice wurde gespeichert → Liste neu laden + ggf. auswaehlen + if (message.type === ('xtts_voice_saved' as any)) { + const name = (message.payload as any).name as string; + if (name) { + setXttsVoice(name); + AsyncStorage.setItem('aria_xtts_voice', name); + } + rvs.send('xtts_list_voices' as any, {}); + } }); return () => { @@ -288,6 +313,36 @@ const SettingsScreen: React.FC = () => { // In Produktion: Wert in AsyncStorage persistieren }, []); + // --- XTTS Voice --- + + const selectVoice = useCallback((voiceName: string) => { + setXttsVoice(voiceName); + AsyncStorage.setItem('aria_xtts_voice', voiceName); + }, []); + + const deleteVoice = useCallback((name: string) => { + Alert.alert( + 'Stimme loeschen', + `Stimme "${name}" vom Server endgueltig loeschen?\nAlle Apps verlieren sie.`, + [ + { text: 'Abbrechen', style: 'cancel' }, + { + text: 'Loeschen', + style: 'destructive', + onPress: () => { + rvs.send('xtts_delete_voice' as any, { name }); + if (xttsVoice === name) { + setXttsVoice(''); + AsyncStorage.setItem('aria_xtts_voice', ''); + } + // Liste nach kurzer Wartezeit neu laden (XTTS-Bridge schickt eh neue Liste) + setTimeout(() => rvs.send('xtts_list_voices' as any, {}), 500); + }, + }, + ], + ); + }, [xttsVoice]); + // --- Modus aendern --- const handleModeChange = useCallback((modeId: string) => { @@ -321,6 +376,10 @@ const SettingsScreen: React.FC = () => { onScan={handleQRScan} onClose={() => setScannerVisible(false)} /> + setVoiceCloneVisible(false)} + /> {/* === Verbindung === */} @@ -455,7 +514,6 @@ const SettingsScreen: React.FC = () => { Nur lokal — andere Geraete sind unabhaengig. Wenn aus, erscheint im Chat auch kein Mund-Button. - Stimme und Voice-Cloning werden zentral in der Diagnose eingestellt. { thumbColor={ttsEnabled ? '#FFFFFF' : '#666680'} /> + + {ttsEnabled && ( + + Stimme (geraetelokal) + + Eigene Wahl fuer dieses Geraet. Ohne Auswahl gilt der Diagnostic-Default. + + + {/* Default-Option */} + selectVoice('')} + > + + Standard (Diagnostic-Default) + + {xttsVoice === '' && {'\u2713'}} + + + {availableVoices.length === 0 ? ( + + Keine eigenen Stimmen auf dem XTTS-Server. + + ) : ( + availableVoices.map(v => ( + + selectVoice(v.name)} + > + + {v.name} + + {(v.size / 1024).toFixed(0)} KB + + {xttsVoice === v.name && {'\u2713'}} + deleteVoice(v.name)} style={styles.voiceRowDelete}> + X + + + )) + )} + + + setVoiceCloneVisible(true)} + > + {'\uD83C\uDFA4'} Eigene Stimme aufnehmen + + rvs.send('xtts_list_voices' as any, {})} + > + Aktualisieren + + + + )} {/* === Speicher === */} @@ -782,6 +899,55 @@ const styles = StyleSheet.create({ marginTop: 2, }, + // XTTS Voice List + voiceRow: { + flexDirection: 'row', + alignItems: 'center', + backgroundColor: '#1E1E2E', + borderRadius: 8, + padding: 10, + marginTop: 6, + borderWidth: 1, + borderColor: 'transparent', + }, + voiceRowActive: { + borderColor: '#0096FF', + backgroundColor: '#0D1A2E', + }, + voiceRowName: { + color: '#CCCCDD', + fontSize: 14, + fontWeight: '500', + }, + voiceRowNameActive: { + color: '#FFFFFF', + }, + voiceRowMeta: { + color: '#666680', + fontSize: 11, + marginTop: 2, + }, + voiceRowCheck: { + color: '#34C759', + fontSize: 16, + fontWeight: '700', + marginHorizontal: 6, + }, + voiceRowDelete: { + width: 28, + height: 28, + borderRadius: 14, + backgroundColor: 'rgba(255,59,48,0.2)', + alignItems: 'center', + justifyContent: 'center', + marginLeft: 4, + }, + voiceRowDeleteIcon: { + color: '#FF3B30', + fontSize: 12, + fontWeight: '700', + }, + // Stimmen voiceBtn: { flex: 1, diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py index 0a31319..ed3d7d9 100644 --- a/bridge/aria_bridge.py +++ b/bridge/aria_bridge.py @@ -1291,6 +1291,11 @@ class ARIABridge: if not audio_b64: logger.warning("[rvs] Audio ohne Daten empfangen") return + # Voice-Override fuer die kommende ARIA-Antwort (App-lokal gewaehlt) + voice_override = payload.get("voice", "") + if voice_override: + self._next_voice_override = voice_override + logger.info("[rvs] Voice-Override (via Audio): %s", voice_override) logger.info("[rvs] Audio empfangen: %s, %dms, %dKB", mime_type, duration_ms, len(audio_b64) // 1365) asyncio.create_task(self._process_app_audio(audio_b64, mime_type))