feat: App XTTS-Voice-Auswahl + Aufnahme + Loeschen (geraetelokal)

App Settings: Voice-Sektion (nur wenn TTS an)
- Liste aller XTTS-Server-Stimmen mit Auswahl-Radio + X zum Loeschen
- 'Standard' fuer Diagnostic-Default-Voice (keine lokale Ueberschreibung)
- 'Aktualisieren' Button laedt Liste neu (xtts_list_voices via RVS)
- 'Eigene Stimme aufnehmen' oeffnet VoiceCloneModal

VoiceCloneModal: 30s Aufnahme + Upload
- Vorlese-Text (>30s Lesedauer, thematisch passend)
- Rot-pulsierender Stop-Button, live Timer + Progressbar
- Auto-Stop bei 30s, Hinweise ab 15s ('genug fuer gute Clonung')
- Nach Stop: Namenseingabe (a-Z, 0-9, _, -), Upload via voice_upload
- Nach Upload: Modal schliesst, Settings bekommt xtts_voice_saved
  und setzt automatisch die neue Stimme als gewaehlt

Voice-Flow App → Bridge → XTTS (geraetelokal):
- Jeder chat/audio/tts_request schickt aria_xtts_voice (AsyncStorage)
  mit der Message mit
- Bridge speichert _next_voice_override bei chat/audio Empfang,
  nutzt es fuer die naechste ARIA-Antwort und resettet dann
- Fallback: globale xtts_voice aus voice_config.json (Diagnostic)

Ergebnis:
- Gerat A hat 'stefan' geclont → ARIA antwortet Geraet A mit stefan
- Gerat B hat nichts gewaehlt → ARIA antwortet Geraet B mit Default
- Diagnostic-Einstellung wirkt als fallback-default fuer neue Geraete

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
duffyduck 2026-04-19 22:48:24 +02:00
parent fc2438be2d
commit 99cb83202e
4 changed files with 543 additions and 3 deletions

View File

@ -0,0 +1,362 @@
/**
* VoiceCloneModal Eigene Stimme aufnehmen und an XTTS uploaden.
*
* Flow:
* - Modal zeigt Vorlesetext (>30s Lesedauer) + Aufnahme-Button
* - Bei Aufnahme: max 30s, Fortschrittsbalken, Countdown
* - Bei Stop: Name abfragen, dann als voice_upload ueber RVS schicken
* - XTTS-Bridge speichert /voices/<name>.wav, antwortet mit xtts_voice_saved
*/
import React, { useCallback, useEffect, useRef, useState } from 'react';
import {
Modal,
View,
Text,
TouchableOpacity,
StyleSheet,
Alert,
ScrollView,
ActivityIndicator,
TextInput,
} from 'react-native';
import audioService from '../services/audio';
import rvs from '../services/rvs';
interface Props {
visible: boolean;
onClose: () => void;
}
const SAMPLE_TEXT = `Das ist meine eigene Stimme fuer ARIA. Ich lese jetzt einen laengeren Absatz laut vor, damit das Voice-Cloning eine gute Grundlage hat. Guten Tag, ich heisse Stefan und baue gerade mit grosser Begeisterung an meinem persoenlichen KI-Assistenten. Wir automatisieren Infrastruktur, managen Sessions und spielen mit Sprachsynthese. Die letzten Jahre habe ich viel gelernt, vor allem dass Geduld genauso wichtig ist wie Neugier. Hoert sich das jetzt an wie ich selbst? Wenn alles klappt, spricht ARIA bald mit dieser Stimme.`;
const MAX_DURATION_MS = 30000;
const TARGET_DURATION_MS = 15000;
const VoiceCloneModal: React.FC<Props> = ({ visible, onClose }) => {
const [recording, setRecording] = useState(false);
const [durationMs, setDurationMs] = useState(0);
const [voiceName, setVoiceName] = useState('');
const [processing, setProcessing] = useState(false);
const [recordingPath, setRecordingPath] = useState('');
const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
const startTimeRef = useRef<number>(0);
// Zustand zuruecksetzen wenn Modal schliesst/oeffnet
useEffect(() => {
if (!visible) {
setRecording(false);
setDurationMs(0);
setVoiceName('');
setProcessing(false);
setRecordingPath('');
if (timerRef.current) clearInterval(timerRef.current);
}
}, [visible]);
// Cleanup bei Unmount
useEffect(() => {
return () => {
if (timerRef.current) clearInterval(timerRef.current);
if (recording) audioService.stopRecording().catch(() => {});
};
}, [recording]);
const startRecording = useCallback(async () => {
// Frische Aufnahme
setDurationMs(0);
setRecordingPath('');
const ok = await audioService.startRecording(false);
if (!ok) {
Alert.alert('Fehler', 'Aufnahme konnte nicht gestartet werden (Mikrofon-Berechtigung?)');
return;
}
setRecording(true);
startTimeRef.current = Date.now();
timerRef.current = setInterval(async () => {
const elapsed = Date.now() - startTimeRef.current;
setDurationMs(elapsed);
if (elapsed >= MAX_DURATION_MS) {
await stopRecording();
}
}, 100);
}, []);
const stopRecording = useCallback(async () => {
if (timerRef.current) {
clearInterval(timerRef.current);
timerRef.current = null;
}
if (!recording) return;
const result = await audioService.stopRecording();
setRecording(false);
if (!result) {
Alert.alert('Keine Sprache erkannt', 'Versuch es bitte nochmal — sprich bis der Timer mindestens 10 Sekunden anzeigt.');
setDurationMs(0);
return;
}
// Temp-Datei wurde schon geloescht (stopRecording cleaned up).
// Wir brauchen aber base64 aus result direkt fuers Upload.
// result.base64 ist bereits da.
setRecordingPath(result.base64);
}, [recording]);
const uploadVoice = useCallback(async () => {
const name = voiceName.trim();
if (!name) {
Alert.alert('Name fehlt', 'Bitte gib der Stimme einen Namen (nur Buchstaben, Zahlen, _ und -).');
return;
}
if (!/^[a-zA-Z0-9_-]+$/.test(name)) {
Alert.alert('Ungueltiger Name', 'Nur Buchstaben, Zahlen, _ und - erlaubt.');
return;
}
if (!recordingPath) {
Alert.alert('Keine Aufnahme', 'Bitte zuerst aufnehmen.');
return;
}
setProcessing(true);
try {
// voice_upload erwartet samples als Array mit base64 (aus Diagnostic-Format kopiert)
rvs.send('voice_upload' as any, {
name,
samples: [{ base64: recordingPath }],
});
Alert.alert('Hochgeladen', `Stimme "${name}" wird vom XTTS-Server verarbeitet. Nach ein paar Sekunden in der Liste verfuegbar.`);
onClose();
} catch (err: any) {
Alert.alert('Fehler', err.message);
} finally {
setProcessing(false);
}
}, [voiceName, recordingPath, onClose]);
const progress = Math.min(durationMs / MAX_DURATION_MS, 1);
const sec = Math.floor(durationMs / 1000);
const enoughRecorded = durationMs >= TARGET_DURATION_MS;
return (
<Modal visible={visible} animationType="slide" onRequestClose={onClose}>
<View style={styles.container}>
<View style={styles.header}>
<Text style={styles.title}>Eigene Stimme aufnehmen</Text>
<TouchableOpacity onPress={onClose}>
<Text style={styles.closeX}>{'\u2715'}</Text>
</TouchableOpacity>
</View>
<ScrollView style={styles.content} contentContainerStyle={{padding: 16}}>
<Text style={styles.hint}>
Lies den Text laut und deutlich vor. Maximal 30 Sekunden. Je mehr du sprichst
(ziel: bis zum Ende des Textes, ca. 20-30s), desto besser wird die geklonte
Stimme.
</Text>
<View style={styles.sampleTextBox}>
<Text style={styles.sampleText}>{SAMPLE_TEXT}</Text>
</View>
{/* Timer + Fortschritt */}
<View style={{marginTop: 20, alignItems: 'center'}}>
<Text style={[styles.timer, recording && styles.timerActive]}>
{sec.toString().padStart(2, '0')} / 30 s
</Text>
<View style={styles.progressBar}>
<View style={[styles.progressFill, {width: `${progress * 100}%`, backgroundColor: recording ? '#FF3B30' : '#0096FF'}]} />
</View>
</View>
{/* Aufnahme-Button */}
{!recordingPath && (
<TouchableOpacity
style={[styles.recordBtn, recording && styles.recordBtnActive]}
onPress={recording ? stopRecording : startRecording}
>
<Text style={styles.recordIcon}>{recording ? '\u25A0' : '\u25CF'}</Text>
<Text style={styles.recordLabel}>{recording ? 'Stop' : 'Aufnahme starten'}</Text>
</TouchableOpacity>
)}
{/* Nach Aufnahme: Name + Upload */}
{recordingPath && (
<View style={{marginTop: 20}}>
<Text style={styles.hint}>
Aufnahme ({sec}s) fertig. Vergib einen Namen und lade hoch.
</Text>
<TextInput
style={styles.nameInput}
value={voiceName}
onChangeText={setVoiceName}
placeholder="z.B. stefan"
placeholderTextColor="#555570"
autoCapitalize="none"
autoCorrect={false}
/>
<View style={{flexDirection: 'row', gap: 8, marginTop: 12}}>
<TouchableOpacity
style={[styles.secondaryBtn, {flex: 1}]}
onPress={() => { setRecordingPath(''); setDurationMs(0); }}
>
<Text style={styles.secondaryBtnText}>Nochmal aufnehmen</Text>
</TouchableOpacity>
<TouchableOpacity
style={[styles.primaryBtn, {flex: 1}]}
onPress={uploadVoice}
disabled={processing}
>
{processing
? <ActivityIndicator color="#fff" />
: <Text style={styles.primaryBtnText}>Hochladen</Text>
}
</TouchableOpacity>
</View>
</View>
)}
{recording && !enoughRecorded && (
<Text style={[styles.hint, {marginTop: 12, color: '#FFD60A', textAlign: 'center'}]}>
Bitte weiter lesen mindestens 15 Sekunden
</Text>
)}
{recording && enoughRecorded && (
<Text style={[styles.hint, {marginTop: 12, color: '#34C759', textAlign: 'center'}]}>
Genug Audio fuer eine gute Clonung. Du kannst stoppen.
</Text>
)}
</ScrollView>
</View>
</Modal>
);
};
const styles = StyleSheet.create({
container: {
flex: 1,
backgroundColor: '#0D0D1A',
},
header: {
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'space-between',
paddingHorizontal: 16,
paddingTop: 48,
paddingBottom: 16,
borderBottomWidth: 1,
borderBottomColor: '#1E1E2E',
},
title: {
color: '#FFFFFF',
fontSize: 18,
fontWeight: '700',
},
closeX: {
color: '#8888AA',
fontSize: 24,
paddingHorizontal: 8,
},
content: {
flex: 1,
},
hint: {
color: '#8888AA',
fontSize: 13,
lineHeight: 20,
},
sampleTextBox: {
marginTop: 12,
padding: 14,
backgroundColor: '#12122A',
borderRadius: 10,
borderWidth: 1,
borderColor: '#1E1E2E',
},
sampleText: {
color: '#E0E0F0',
fontSize: 15,
lineHeight: 24,
},
timer: {
color: '#666680',
fontSize: 42,
fontWeight: '700',
fontVariant: ['tabular-nums'],
},
timerActive: {
color: '#FF3B30',
},
progressBar: {
marginTop: 8,
width: '100%',
height: 8,
backgroundColor: '#1E1E2E',
borderRadius: 4,
overflow: 'hidden',
},
progressFill: {
height: '100%',
},
recordBtn: {
marginTop: 24,
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'center',
gap: 12,
backgroundColor: '#1E1E2E',
borderRadius: 12,
padding: 18,
borderWidth: 2,
borderColor: '#34C759',
},
recordBtnActive: {
borderColor: '#FF3B30',
backgroundColor: 'rgba(255,59,48,0.15)',
},
recordIcon: {
color: '#FF3B30',
fontSize: 24,
fontWeight: '700',
},
recordLabel: {
color: '#FFFFFF',
fontSize: 17,
fontWeight: '600',
},
nameInput: {
marginTop: 10,
backgroundColor: '#1E1E2E',
borderRadius: 8,
paddingHorizontal: 14,
paddingVertical: 12,
color: '#FFFFFF',
fontSize: 15,
borderWidth: 1,
borderColor: '#2A2A3E',
},
primaryBtn: {
backgroundColor: '#0096FF',
borderRadius: 10,
padding: 14,
alignItems: 'center',
},
primaryBtnText: {
color: '#FFFFFF',
fontSize: 15,
fontWeight: '700',
},
secondaryBtn: {
backgroundColor: '#1E1E2E',
borderRadius: 10,
padding: 14,
alignItems: 'center',
borderWidth: 1,
borderColor: '#2A2A3E',
},
secondaryBtnText: {
color: '#8888AA',
fontSize: 14,
fontWeight: '600',
},
});
export default VoiceCloneModal;

View File

@ -110,6 +110,8 @@ const ChatScreen: React.FC = () => {
// Gerätelokale TTS-Config: globaler Toggle (aus Settings) + temporäres Muten (Mund-Button)
const [ttsDeviceEnabled, setTtsDeviceEnabled] = useState(true);
const [ttsMuted, setTtsMuted] = useState(false);
// Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default)
const localXttsVoiceRef = useRef<string>('');
const flatListRef = useRef<FlatList>(null);
const messageIdCounter = useRef(0);
@ -127,6 +129,8 @@ const ChatScreen: React.FC = () => {
setTtsDeviceEnabled(enabled !== 'false'); // default true
const muted = await AsyncStorage.getItem('aria_tts_muted');
setTtsMuted(muted === 'true'); // default false
const voice = await AsyncStorage.getItem('aria_xtts_voice');
localXttsVoiceRef.current = voice || '';
};
loadTtsSettings();
// Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context)
@ -386,6 +390,7 @@ const ChatScreen: React.FC = () => {
base64: result.base64,
durationMs: result.durationMs,
mimeType: result.mimeType,
voice: localXttsVoiceRef.current,
...(location && { location }),
});
}
@ -488,9 +493,10 @@ const ChatScreen: React.FC = () => {
};
setMessages(prev => capMessages([...prev, userMsg]));
// An RVS senden
// An RVS senden — mit geraetelokaler Voice (Bridge nutzt sie fuer die Antwort)
rvs.send('chat', {
text,
voice: localXttsVoiceRef.current,
...(location && { location }),
});
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]);
@ -599,6 +605,7 @@ const ChatScreen: React.FC = () => {
if (messageText) {
rvs.send('chat', {
text: messageText,
voice: localXttsVoiceRef.current,
...(location && { location }),
});
}
@ -689,7 +696,7 @@ const ChatScreen: React.FC = () => {
// wieder mit der Nachricht verknuepft (fuer den naechsten Replay aus Cache)
rvs.send('tts_request' as any, {
text: item.text,
voice: '',
voice: localXttsVoiceRef.current,
messageId: item.messageId || '',
});
}

View File

@ -22,6 +22,7 @@ import DocumentPicker from 'react-native-document-picker';
import rvs, { ConnectionState, RVSMessage, ConnectionConfig, ConnectionLogEntry } from '../services/rvs';
import ModeSelector from '../components/ModeSelector';
import QRScanner from '../components/QRScanner';
import VoiceCloneModal from '../components/VoiceCloneModal';
const STORAGE_PATH_KEY = 'aria_attachment_storage_path';
const DEFAULT_STORAGE_PATH = `${RNFS.DocumentDirectoryPath}/chat_attachments`;
@ -73,6 +74,9 @@ const SettingsScreen: React.FC = () => {
const [storageSize, setStorageSize] = useState('...');
const [ttsEnabled, setTtsEnabled] = useState(true);
const [editingPath, setEditingPath] = useState(false);
const [xttsVoice, setXttsVoice] = useState('');
const [availableVoices, setAvailableVoices] = useState<Array<{name: string, size: number}>>([]);
const [voiceCloneVisible, setVoiceCloneVisible] = useState(false);
const [tempPath, setTempPath] = useState('');
let logIdCounter = 0;
@ -95,6 +99,11 @@ const SettingsScreen: React.FC = () => {
AsyncStorage.getItem('aria_tts_enabled').then(saved => {
if (saved !== null) setTtsEnabled(saved === 'true');
});
AsyncStorage.getItem('aria_xtts_voice').then(saved => {
if (saved) setXttsVoice(saved);
});
// Voice-Liste vom XTTS-Server holen (via RVS)
rvs.send('xtts_list_voices' as any, {});
}, []);
// Speichergroesse berechnen
@ -225,6 +234,22 @@ const SettingsScreen: React.FC = () => {
const mode = message.payload.mode as string;
if (mode) setCurrentMode(mode);
}
// XTTS-Voice-Liste
if (message.type === ('xtts_voices_list' as any)) {
const voices = ((message.payload as any).voices || []) as Array<{name: string, size: number}>;
setAvailableVoices(voices);
}
// Voice wurde gespeichert → Liste neu laden + ggf. auswaehlen
if (message.type === ('xtts_voice_saved' as any)) {
const name = (message.payload as any).name as string;
if (name) {
setXttsVoice(name);
AsyncStorage.setItem('aria_xtts_voice', name);
}
rvs.send('xtts_list_voices' as any, {});
}
});
return () => {
@ -288,6 +313,36 @@ const SettingsScreen: React.FC = () => {
// In Produktion: Wert in AsyncStorage persistieren
}, []);
// --- XTTS Voice ---
const selectVoice = useCallback((voiceName: string) => {
setXttsVoice(voiceName);
AsyncStorage.setItem('aria_xtts_voice', voiceName);
}, []);
const deleteVoice = useCallback((name: string) => {
Alert.alert(
'Stimme loeschen',
`Stimme "${name}" vom Server endgueltig loeschen?\nAlle Apps verlieren sie.`,
[
{ text: 'Abbrechen', style: 'cancel' },
{
text: 'Loeschen',
style: 'destructive',
onPress: () => {
rvs.send('xtts_delete_voice' as any, { name });
if (xttsVoice === name) {
setXttsVoice('');
AsyncStorage.setItem('aria_xtts_voice', '');
}
// Liste nach kurzer Wartezeit neu laden (XTTS-Bridge schickt eh neue Liste)
setTimeout(() => rvs.send('xtts_list_voices' as any, {}), 500);
},
},
],
);
}, [xttsVoice]);
// --- Modus aendern ---
const handleModeChange = useCallback((modeId: string) => {
@ -321,6 +376,10 @@ const SettingsScreen: React.FC = () => {
onScan={handleQRScan}
onClose={() => setScannerVisible(false)}
/>
<VoiceCloneModal
visible={voiceCloneVisible}
onClose={() => setVoiceCloneVisible(false)}
/>
<ScrollView style={styles.container} contentContainerStyle={styles.content}>
{/* === Verbindung === */}
@ -455,7 +514,6 @@ const SettingsScreen: React.FC = () => {
<Text style={styles.toggleHint}>
Nur lokal andere Geraete sind unabhaengig.
Wenn aus, erscheint im Chat auch kein Mund-Button.
Stimme und Voice-Cloning werden zentral in der Diagnose eingestellt.
</Text>
</View>
<Switch
@ -468,6 +526,65 @@ const SettingsScreen: React.FC = () => {
thumbColor={ttsEnabled ? '#FFFFFF' : '#666680'}
/>
</View>
{ttsEnabled && (
<View style={{marginTop: 20}}>
<Text style={styles.toggleLabel}>Stimme (geraetelokal)</Text>
<Text style={styles.toggleHint}>
Eigene Wahl fuer dieses Geraet. Ohne Auswahl gilt der Diagnostic-Default.
</Text>
{/* Default-Option */}
<TouchableOpacity
style={[styles.voiceRow, xttsVoice === '' && styles.voiceRowActive]}
onPress={() => selectVoice('')}
>
<Text style={[styles.voiceRowName, xttsVoice === '' && styles.voiceRowNameActive]}>
Standard (Diagnostic-Default)
</Text>
{xttsVoice === '' && <Text style={styles.voiceRowCheck}>{'\u2713'}</Text>}
</TouchableOpacity>
{availableVoices.length === 0 ? (
<Text style={[styles.toggleHint, {marginTop: 8, textAlign: 'center'}]}>
Keine eigenen Stimmen auf dem XTTS-Server.
</Text>
) : (
availableVoices.map(v => (
<View key={v.name} style={[styles.voiceRow, xttsVoice === v.name && styles.voiceRowActive]}>
<TouchableOpacity
style={{flex: 1}}
onPress={() => selectVoice(v.name)}
>
<Text style={[styles.voiceRowName, xttsVoice === v.name && styles.voiceRowNameActive]}>
{v.name}
</Text>
<Text style={styles.voiceRowMeta}>{(v.size / 1024).toFixed(0)} KB</Text>
</TouchableOpacity>
{xttsVoice === v.name && <Text style={styles.voiceRowCheck}>{'\u2713'}</Text>}
<TouchableOpacity onPress={() => deleteVoice(v.name)} style={styles.voiceRowDelete}>
<Text style={styles.voiceRowDeleteIcon}>X</Text>
</TouchableOpacity>
</View>
))
)}
<View style={{flexDirection: 'row', gap: 8, marginTop: 12}}>
<TouchableOpacity
style={[styles.connectButton, {flex: 1}]}
onPress={() => setVoiceCloneVisible(true)}
>
<Text style={styles.connectButtonText}>{'\uD83C\uDFA4'} Eigene Stimme aufnehmen</Text>
</TouchableOpacity>
<TouchableOpacity
style={[styles.clearButton, {flex: 0.4, marginTop: 0}]}
onPress={() => rvs.send('xtts_list_voices' as any, {})}
>
<Text style={styles.clearButtonText}>Aktualisieren</Text>
</TouchableOpacity>
</View>
</View>
)}
</View>
{/* === Speicher === */}
@ -782,6 +899,55 @@ const styles = StyleSheet.create({
marginTop: 2,
},
// XTTS Voice List
voiceRow: {
flexDirection: 'row',
alignItems: 'center',
backgroundColor: '#1E1E2E',
borderRadius: 8,
padding: 10,
marginTop: 6,
borderWidth: 1,
borderColor: 'transparent',
},
voiceRowActive: {
borderColor: '#0096FF',
backgroundColor: '#0D1A2E',
},
voiceRowName: {
color: '#CCCCDD',
fontSize: 14,
fontWeight: '500',
},
voiceRowNameActive: {
color: '#FFFFFF',
},
voiceRowMeta: {
color: '#666680',
fontSize: 11,
marginTop: 2,
},
voiceRowCheck: {
color: '#34C759',
fontSize: 16,
fontWeight: '700',
marginHorizontal: 6,
},
voiceRowDelete: {
width: 28,
height: 28,
borderRadius: 14,
backgroundColor: 'rgba(255,59,48,0.2)',
alignItems: 'center',
justifyContent: 'center',
marginLeft: 4,
},
voiceRowDeleteIcon: {
color: '#FF3B30',
fontSize: 12,
fontWeight: '700',
},
// Stimmen
voiceBtn: {
flex: 1,

View File

@ -1291,6 +1291,11 @@ class ARIABridge:
if not audio_b64:
logger.warning("[rvs] Audio ohne Daten empfangen")
return
# Voice-Override fuer die kommende ARIA-Antwort (App-lokal gewaehlt)
voice_override = payload.get("voice", "")
if voice_override:
self._next_voice_override = voice_override
logger.info("[rvs] Voice-Override (via Audio): %s", voice_override)
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
mime_type, duration_ms, len(audio_b64) // 1365)
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))