Compare commits
24 Commits
b373f915b5
...
v0.0.6.2
| Author | SHA1 | Date | |
|---|---|---|---|
| 745b4a07c0 | |||
| 23ca815cb2 | |||
| cc3fac8142 | |||
| cd89e36ec2 | |||
| f5b4285d15 | |||
| 248e7c9ae4 | |||
| 7058cc8d8d | |||
| 7919489543 | |||
| feac7f2479 | |||
| b80b813703 | |||
| e7bb6c37cb | |||
| d146ca92c4 | |||
| fd95af2c40 | |||
| 9e12e0001c | |||
| 1d34143be5 | |||
| 0fc11e33c8 | |||
| dae603541b | |||
| 87b4cd305c | |||
| 190352820c | |||
| 2264f4e3bc | |||
| 58fd8721e3 | |||
| 4f494daffb | |||
| 958c8d6fc6 | |||
| 5ba89c7191 |
@@ -650,6 +650,33 @@ In der Diagnostic unter Einstellungen → Sprachausgabe:
|
|||||||
> **Tipp:** Fuer beste Ergebnisse: saubere Aufnahme, eine Stimme, kein Hintergrund,
|
> **Tipp:** Fuer beste Ergebnisse: saubere Aufnahme, eine Stimme, kein Hintergrund,
|
||||||
> 10-30 Sekunden Gesamtlaenge. Mehrere kurze Dateien werden zusammengefuegt.
|
> 10-30 Sekunden Gesamtlaenge. Mehrere kurze Dateien werden zusammengefuegt.
|
||||||
|
|
||||||
|
### Deutsches Fine-Tune (bessere Qualitaet auf Deutsch)
|
||||||
|
|
||||||
|
Das Default-Modell `F5TTS_v1_Base` ist primaer auf Englisch + Chinesisch trainiert
|
||||||
|
und liefert auf Deutsch merklich schwaechere Voice-Cloning-Qualitaet als XTTS es
|
||||||
|
tat. Community-Fine-Tune von [aihpi](https://huggingface.co/aihpi/F5-TTS-German)
|
||||||
|
auf dem Emilia-Dataset + Common Voice 19.0 funktioniert deutlich besser.
|
||||||
|
|
||||||
|
**Konfiguration ueber Diagnostic → "F5-TTS Modell-Tuning (advanced)":**
|
||||||
|
|
||||||
|
| Feld | Wert |
|
||||||
|
|------|------|
|
||||||
|
| Modell-Architektur | `F5TTS_Base` *(nicht v1_Base! Fine-Tune basiert auf der alten Architektur)* |
|
||||||
|
| Custom Checkpoint | `hf://aihpi/F5-TTS-German/F5TTS_Base/model_365000.safetensors` |
|
||||||
|
| Custom Vocab | `hf://aihpi/F5-TTS-German/vocab.txt` |
|
||||||
|
| cfg_strength | `2.0` |
|
||||||
|
| nfe_step | `32` |
|
||||||
|
|
||||||
|
→ "Anwenden" klicken. Die `hf://`-Pfade werden einmalig automatisch runter-
|
||||||
|
geladen (~3-5GB, landet im `xtts/hf-cache/`) und bei Container-Restart aus
|
||||||
|
dem Cache wiederverwendet.
|
||||||
|
|
||||||
|
> **Warnung zur BigVGAN-Variante** (`F5TTS_Base_bigvgan/model_295000.safetensors`):
|
||||||
|
> funktioniert AKTUELL NICHT mit dieser Bridge. Die f5-tts Library laedt
|
||||||
|
> per Default den Vocos-Vocoder, die BigVGAN-Weights sind damit inkompatibel
|
||||||
|
> → Modell produziert NaN, App bleibt stumm. Nur die **Vocos-Variante
|
||||||
|
> (F5TTS_Base/model_365000.safetensors)** nutzen.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Docker Volumes
|
## Docker Volumes
|
||||||
|
|||||||
@@ -79,8 +79,8 @@ android {
|
|||||||
applicationId "com.ariacockpit"
|
applicationId "com.ariacockpit"
|
||||||
minSdkVersion rootProject.ext.minSdkVersion
|
minSdkVersion rootProject.ext.minSdkVersion
|
||||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||||
versionCode 506
|
versionCode 602
|
||||||
versionName "0.0.5.6"
|
versionName "0.0.6.2"
|
||||||
// Fallback fuer Libraries mit Product Flavors
|
// Fallback fuer Libraries mit Product Flavors
|
||||||
missingDimensionStrategy 'react-native-camera', 'general'
|
missingDimensionStrategy 'react-native-camera', 'general'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,11 +32,17 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
private const val TAG = "PcmStreamPlayer"
|
private const val TAG = "PcmStreamPlayer"
|
||||||
// Fallback wenn JS keinen Wert uebergibt.
|
// Fallback wenn JS keinen Wert uebergibt.
|
||||||
private const val DEFAULT_PREROLL_SECONDS = 3.5
|
private const val DEFAULT_PREROLL_SECONDS = 3.5
|
||||||
private const val MIN_PREROLL_SECONDS = 0.5
|
// 0.0 = sofortige Wiedergabe — play() direkt beim ersten Chunk.
|
||||||
|
// Macht Sinn fuer F5-TTS weil Render so schnell ist dass ein Puffer
|
||||||
|
// unnoetig ist und bei kurzen Saetzen sogar stoeren kann.
|
||||||
|
private const val MIN_PREROLL_SECONDS = 0.0
|
||||||
private const val MAX_PREROLL_SECONDS = 10.0
|
private const val MAX_PREROLL_SECONDS = 10.0
|
||||||
// Stille am Stream-Anfang, damit AudioTrack sauber anfaehrt und die
|
// Stille am Stream-Anfang, damit AudioTrack sauber anfaehrt und die
|
||||||
// ersten Samples nicht abgeschnitten werden (XTTS-Warmup + play()-Latenz).
|
// ersten Samples nicht abgeschnitten werden (XTTS-Warmup + play()-Latenz).
|
||||||
private const val LEADING_SILENCE_SECONDS = 0.2
|
private const val LEADING_SILENCE_SECONDS = 0.3
|
||||||
|
// Stille am Ende — puffert das Hardware-Flushen damit die letzten
|
||||||
|
// echten Samples garantiert ausgespielt werden bevor stop() kommt.
|
||||||
|
private const val TRAILING_SILENCE_SECONDS = 0.3
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun getName() = "PcmStreamPlayer"
|
override fun getName() = "PcmStreamPlayer"
|
||||||
@@ -59,9 +65,12 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
// Alte Session beenden falls vorhanden
|
// Alte Session beenden falls vorhanden
|
||||||
stopInternal()
|
stopInternal()
|
||||||
|
|
||||||
val prerollSec = prerollSeconds
|
// Nur NaN/Inf → Default. 0.0 ist gueltig (= sofortige Wiedergabe).
|
||||||
.coerceIn(MIN_PREROLL_SECONDS, MAX_PREROLL_SECONDS)
|
val prerollSec = if (prerollSeconds.isFinite() && prerollSeconds >= 0.0) {
|
||||||
.let { if (it.isFinite() && it > 0) it else DEFAULT_PREROLL_SECONDS }
|
prerollSeconds.coerceIn(MIN_PREROLL_SECONDS, MAX_PREROLL_SECONDS)
|
||||||
|
} else {
|
||||||
|
DEFAULT_PREROLL_SECONDS
|
||||||
|
}
|
||||||
|
|
||||||
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
||||||
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
||||||
@@ -103,9 +112,9 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
val t = track ?: return@Thread
|
val t = track ?: return@Thread
|
||||||
try {
|
try {
|
||||||
// Leading-Silence in den Buffer — gibt AudioTrack Zeit anzufahren.
|
// Leading-Silence in den Buffer — gibt AudioTrack Zeit anzufahren.
|
||||||
val silenceBytes = ((sampleRate * channels * 2) * LEADING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
|
val leadingBytes = ((sampleRate * channels * 2) * LEADING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
|
||||||
if (silenceBytes > 0) {
|
if (leadingBytes > 0) {
|
||||||
val silence = ByteArray(silenceBytes)
|
val silence = ByteArray(leadingBytes)
|
||||||
var silOff = 0
|
var silOff = 0
|
||||||
while (silOff < silence.size && !writerShouldStop) {
|
while (silOff < silence.size && !writerShouldStop) {
|
||||||
val w = t.write(silence, silOff, silence.size - silOff)
|
val w = t.write(silence, silOff, silence.size - silOff)
|
||||||
@@ -114,18 +123,38 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
}
|
}
|
||||||
bytesBuffered += silence.size
|
bytesBuffered += silence.size
|
||||||
}
|
}
|
||||||
while (!writerShouldStop) {
|
// Bei preroll=0: play() SOFORT nach Leading-Silence aufrufen,
|
||||||
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
|
// nicht erst bei Ankunft des ersten echten Chunks. Android's
|
||||||
|
// AudioTrack haelt den Play-State und wartet auf neue Samples.
|
||||||
|
// So verschluckt es keine Worte wenn der erste Chunk erst
|
||||||
|
// nach play()-Startup-Latenz eintrifft.
|
||||||
|
if (prerollBytes == 0 && !playbackStarted) {
|
||||||
|
try {
|
||||||
|
t.play()
|
||||||
|
playbackStarted = true
|
||||||
|
Log.i(TAG, "Playback sofort gestartet (preroll=0, ${bytesBuffered}B silence)")
|
||||||
|
} catch (e: Exception) {
|
||||||
|
Log.w(TAG, "play() sofort failed: ${e.message}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mainLoop@ while (!writerShouldStop) {
|
||||||
|
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS)
|
||||||
|
if (data == null) {
|
||||||
if (endRequested) {
|
if (endRequested) {
|
||||||
// Falls wir vor Pre-Roll enden (kurzer Text): trotzdem abspielen
|
// Falls wir vor Pre-Roll enden (kurzer Text): trotzdem abspielen
|
||||||
if (!playbackStarted) {
|
if (!playbackStarted) {
|
||||||
try { t.play() } catch (_: Exception) {}
|
try {
|
||||||
playbackStarted = true
|
t.play()
|
||||||
|
playbackStarted = true
|
||||||
|
Log.i(TAG, "Playback gestartet VOR Pre-Roll (kurzer Text, ${bytesBuffered}B gepuffert)")
|
||||||
|
} catch (e: Exception) {
|
||||||
|
Log.w(TAG, "play() fallback failed: ${e.message}")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return@Thread
|
break@mainLoop
|
||||||
}
|
}
|
||||||
null
|
continue@mainLoop
|
||||||
} ?: continue
|
}
|
||||||
|
|
||||||
// Pre-Roll Check: play() erst wenn genug gepuffert
|
// Pre-Roll Check: play() erst wenn genug gepuffert
|
||||||
if (!playbackStarted && bytesBuffered + data.size >= prerollBytes) {
|
if (!playbackStarted && bytesBuffered + data.size >= prerollBytes) {
|
||||||
@@ -146,6 +175,19 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
}
|
}
|
||||||
bytesBuffered += data.size
|
bytesBuffered += data.size
|
||||||
}
|
}
|
||||||
|
// Trailing-Silence damit die letzten echten Samples garantiert
|
||||||
|
// durch das Hardware-Buffering kommen bevor stop() sie abschneidet
|
||||||
|
val trailingBytes = ((sampleRate * channels * 2) * TRAILING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
|
||||||
|
if (trailingBytes > 0 && !writerShouldStop) {
|
||||||
|
val silence = ByteArray(trailingBytes)
|
||||||
|
var silOff = 0
|
||||||
|
while (silOff < silence.size && !writerShouldStop) {
|
||||||
|
val w = t.write(silence, silOff, silence.size - silOff)
|
||||||
|
if (w <= 0) break
|
||||||
|
silOff += w
|
||||||
|
}
|
||||||
|
bytesBuffered += silence.size
|
||||||
|
}
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
||||||
} finally {
|
} finally {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "aria-cockpit",
|
"name": "aria-cockpit",
|
||||||
"version": "0.0.5.6",
|
"version": "0.0.6.2",
|
||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"android": "react-native run-android",
|
"android": "react-native run-android",
|
||||||
|
|||||||
@@ -0,0 +1,90 @@
|
|||||||
|
/**
|
||||||
|
* MessageText — rendert Chat-Text mit Auto-Linkifizierung:
|
||||||
|
* - http(s)://... → tippbar, oeffnet im Browser
|
||||||
|
* - mailto: oder plain E-Mail → tippbar, oeffnet Mail-App
|
||||||
|
* - Telefonnummern → tippbar, oeffnet Android-Dialer
|
||||||
|
*
|
||||||
|
* Text ist durchgaengig markierbar/kopierbar (selectable).
|
||||||
|
*/
|
||||||
|
|
||||||
|
import React from 'react';
|
||||||
|
import { Text, Linking, TextStyle, StyleProp } from 'react-native';
|
||||||
|
|
||||||
|
// Regex kombiniert URL | Email | Telefonnummer.
|
||||||
|
// Gruppenreihenfolge ist wichtig fuer die Erkennung unten.
|
||||||
|
//
|
||||||
|
// URL: http://... oder https://... bis zum ersten Whitespace / Anfuehrungszeichen.
|
||||||
|
// Email: simpler Standard-Match (kein RFC-kompatibel aber gut genug).
|
||||||
|
// Telefon: internationale Form (+49..., 0049..., 0176...), darf Leerzeichen
|
||||||
|
// / Bindestriche / Schraegstriche / Klammern enthalten, mindestens 7
|
||||||
|
// Ziffern insgesamt. Vermeidet banale Zahlen (Uhrzeiten, Datum).
|
||||||
|
const LINK_REGEX = new RegExp(
|
||||||
|
'(https?:\\/\\/[^\\s<>"]+)' + // 1: URL
|
||||||
|
'|([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,})' + // 2: Email
|
||||||
|
'|((?:\\+|00)\\d[\\d\\s()\\-\\/]{6,}\\d|0\\d{2,4}[\\s\\/\\-]?[\\d\\s\\-\\/]{5,}\\d)', // 3: Telefon
|
||||||
|
'g',
|
||||||
|
);
|
||||||
|
|
||||||
|
const LINK_STYLE = { color: '#0096FF', textDecorationLine: 'underline' } as TextStyle;
|
||||||
|
|
||||||
|
interface Segment {
|
||||||
|
text: string;
|
||||||
|
kind: 'text' | 'url' | 'email' | 'phone';
|
||||||
|
}
|
||||||
|
|
||||||
|
function tokenize(raw: string): Segment[] {
|
||||||
|
const out: Segment[] = [];
|
||||||
|
let lastEnd = 0;
|
||||||
|
LINK_REGEX.lastIndex = 0;
|
||||||
|
let m: RegExpExecArray | null;
|
||||||
|
while ((m = LINK_REGEX.exec(raw)) !== null) {
|
||||||
|
if (m.index > lastEnd) {
|
||||||
|
out.push({ text: raw.slice(lastEnd, m.index), kind: 'text' });
|
||||||
|
}
|
||||||
|
if (m[1]) out.push({ text: m[1], kind: 'url' });
|
||||||
|
else if (m[2]) out.push({ text: m[2], kind: 'email' });
|
||||||
|
else if (m[3]) out.push({ text: m[3], kind: 'phone' });
|
||||||
|
lastEnd = LINK_REGEX.lastIndex;
|
||||||
|
}
|
||||||
|
if (lastEnd < raw.length) out.push({ text: raw.slice(lastEnd), kind: 'text' });
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
function onPress(seg: Segment) {
|
||||||
|
try {
|
||||||
|
if (seg.kind === 'url') {
|
||||||
|
Linking.openURL(seg.text);
|
||||||
|
} else if (seg.kind === 'email') {
|
||||||
|
Linking.openURL(`mailto:${seg.text}`);
|
||||||
|
} else if (seg.kind === 'phone') {
|
||||||
|
// Android-Dialer erwartet tel:-Schema ohne Leerzeichen/Bindestriche
|
||||||
|
const clean = seg.text.replace(/[\s\-\/()]/g, '');
|
||||||
|
Linking.openURL(`tel:${clean}`);
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
|
}
|
||||||
|
|
||||||
|
interface Props {
|
||||||
|
text: string;
|
||||||
|
style?: StyleProp<TextStyle>;
|
||||||
|
}
|
||||||
|
|
||||||
|
const MessageText: React.FC<Props> = ({ text, style }) => {
|
||||||
|
const segments = React.useMemo(() => tokenize(text), [text]);
|
||||||
|
return (
|
||||||
|
<Text style={style} selectable>
|
||||||
|
{segments.map((seg, i) => {
|
||||||
|
if (seg.kind === 'text') {
|
||||||
|
return <Text key={i}>{seg.text}</Text>;
|
||||||
|
}
|
||||||
|
return (
|
||||||
|
<Text key={i} style={LINK_STYLE} onPress={() => onPress(seg)}>
|
||||||
|
{seg.text}
|
||||||
|
</Text>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</Text>
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
export default MessageText;
|
||||||
@@ -93,18 +93,24 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({
|
|||||||
}
|
}
|
||||||
}, [isRecording]);
|
}, [isRecording]);
|
||||||
|
|
||||||
// VAD Silence Callback — Auto-Stop
|
// VAD Silence Callback — Auto-Stop.
|
||||||
|
// WICHTIG: NICHT auf isRecording prüfen (Closure ist stale) — stattdessen
|
||||||
|
// audioService selber fragen. Empty deps → Listener wird EINMAL registriert.
|
||||||
|
// audioService garantiert jetzt dass der Callback pro Aufnahme nur einmal
|
||||||
|
// feuert (silenceFired-Latch).
|
||||||
|
const onCompleteRef = useRef(onRecordingComplete);
|
||||||
|
useEffect(() => { onCompleteRef.current = onRecordingComplete; }, [onRecordingComplete]);
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const unsubSilence = audioService.onSilenceDetected(async () => {
|
const unsubSilence = audioService.onSilenceDetected(async () => {
|
||||||
if (!isRecording) return;
|
if (audioService.getRecordingState() !== 'recording') return;
|
||||||
setIsRecording(false);
|
|
||||||
const result = await audioService.stopRecording();
|
const result = await audioService.stopRecording();
|
||||||
|
setIsRecording(false);
|
||||||
if (result && result.durationMs > 500) {
|
if (result && result.durationMs > 500) {
|
||||||
onRecordingComplete(result);
|
onCompleteRef.current(result);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
return unsubSilence;
|
return unsubSilence;
|
||||||
}, [isRecording, onRecordingComplete]);
|
}, []);
|
||||||
|
|
||||||
// Auto-Start fuer Wake Word (extern getriggert)
|
// Auto-Start fuer Wake Word (extern getriggert)
|
||||||
const startAutoRecording = useCallback(async () => {
|
const startAutoRecording = useCallback(async () => {
|
||||||
@@ -136,23 +142,35 @@ const VoiceButton: React.FC<VoiceButtonProps> = ({
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Tap-to-Talk: Einmal tippen startet mit Auto-Stop
|
// Tap-to-Talk: Einmal tippen startet mit Auto-Stop.
|
||||||
|
// Guard gegen Doppel-Tap während asyncer Start/Stop.
|
||||||
|
const tapBusy = useRef(false);
|
||||||
const handleTap = async () => {
|
const handleTap = async () => {
|
||||||
if (disabled) return;
|
if (disabled || tapBusy.current) return;
|
||||||
if (isRecording) {
|
tapBusy.current = true;
|
||||||
// Aufnahme manuell stoppen
|
try {
|
||||||
setIsRecording(false);
|
// Fragen WIR den Service, nicht den React-State (Closure kann stale sein)
|
||||||
const result = await audioService.stopRecording();
|
const svcState = audioService.getRecordingState();
|
||||||
if (result && result.durationMs > 300) {
|
if (svcState === 'recording') {
|
||||||
onRecordingComplete(result);
|
// Aufnahme manuell stoppen
|
||||||
}
|
const result = await audioService.stopRecording();
|
||||||
} else {
|
setIsRecording(false);
|
||||||
// Aufnahme mit Auto-Stop starten
|
if (result && result.durationMs > 300) {
|
||||||
const started = await audioService.startRecording(true);
|
onRecordingComplete(result);
|
||||||
if (started) {
|
}
|
||||||
isLongPress.current = false;
|
} else if (svcState === 'idle') {
|
||||||
setIsRecording(true);
|
// Aufnahme mit Auto-Stop starten
|
||||||
|
const started = await audioService.startRecording(true);
|
||||||
|
if (started) {
|
||||||
|
isLongPress.current = false;
|
||||||
|
setIsRecording(true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// svcState === 'processing': Stopp in progress — nichts tun, User
|
||||||
|
// muss nochmal tippen wenn fertig. Aber wir blockieren mit tapBusy
|
||||||
|
// kurz damit der User's UI-Feedback synchron bleibt.
|
||||||
|
} finally {
|
||||||
|
tapBusy.current = false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,8 @@ import updateService from '../services/updater';
|
|||||||
import VoiceButton from '../components/VoiceButton';
|
import VoiceButton from '../components/VoiceButton';
|
||||||
import FileUpload, { FileData } from '../components/FileUpload';
|
import FileUpload, { FileData } from '../components/FileUpload';
|
||||||
import CameraUpload, { PhotoData } from '../components/CameraUpload';
|
import CameraUpload, { PhotoData } from '../components/CameraUpload';
|
||||||
import { RecordingResult, loadConvWindowMs } from '../services/audio';
|
import MessageText from '../components/MessageText';
|
||||||
|
import { RecordingResult, loadConvWindowMs, loadTtsSpeed, TTS_SPEED_DEFAULT } from '../services/audio';
|
||||||
import Geolocation from '@react-native-community/geolocation';
|
import Geolocation from '@react-native-community/geolocation';
|
||||||
|
|
||||||
// --- Typen ---
|
// --- Typen ---
|
||||||
@@ -116,6 +117,13 @@ const ChatScreen: React.FC = () => {
|
|||||||
const [ttsMuted, setTtsMuted] = useState(false);
|
const [ttsMuted, setTtsMuted] = useState(false);
|
||||||
// Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default)
|
// Gerätelokale XTTS-Voice-Wahl (bevorzugt gegenueber dem globalen Default)
|
||||||
const localXttsVoiceRef = useRef<string>('');
|
const localXttsVoiceRef = useRef<string>('');
|
||||||
|
// Geraetelokale TTS-Wiedergabegeschwindigkeit (speed-Param an F5-TTS)
|
||||||
|
const ttsSpeedRef = useRef<number>(TTS_SPEED_DEFAULT);
|
||||||
|
// Spiegelung der TTS-Settings in einer Ref — damit die onMessage-Closure
|
||||||
|
// (useEffect mit []-deps) IMMER die aktuellen Werte sieht. Ohne Ref
|
||||||
|
// bliebe canPlay auf dem Mount-Initial-Wert haengen (mute ignoriert,
|
||||||
|
// oder AsyncStorage-Load nicht beruecksichtigt).
|
||||||
|
const ttsCanPlayRef = useRef<boolean>(true);
|
||||||
|
|
||||||
const flatListRef = useRef<FlatList>(null);
|
const flatListRef = useRef<FlatList>(null);
|
||||||
const messageIdCounter = useRef(0);
|
const messageIdCounter = useRef(0);
|
||||||
@@ -135,6 +143,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
setTtsMuted(muted === 'true'); // default false
|
setTtsMuted(muted === 'true'); // default false
|
||||||
const voice = await AsyncStorage.getItem('aria_xtts_voice');
|
const voice = await AsyncStorage.getItem('aria_xtts_voice');
|
||||||
localXttsVoiceRef.current = voice || '';
|
localXttsVoiceRef.current = voice || '';
|
||||||
|
ttsSpeedRef.current = await loadTtsSpeed();
|
||||||
};
|
};
|
||||||
loadTtsSettings();
|
loadTtsSettings();
|
||||||
// Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context)
|
// Poll alle 2s um Settings-Aenderung mitzubekommen (einfache Loesung ohne Context)
|
||||||
@@ -147,6 +156,12 @@ const ChatScreen: React.FC = () => {
|
|||||||
wakeWordService.loadFromStorage().catch(() => {});
|
wakeWordService.loadFromStorage().catch(() => {});
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
// ttsCanPlayRef live aktuell halten — Closure in onMessage unten liest
|
||||||
|
// darueber statt direkt ttsDeviceEnabled/ttsMuted (sonst stale).
|
||||||
|
useEffect(() => {
|
||||||
|
ttsCanPlayRef.current = ttsDeviceEnabled && !ttsMuted;
|
||||||
|
}, [ttsDeviceEnabled, ttsMuted]);
|
||||||
|
|
||||||
const toggleMute = useCallback(() => {
|
const toggleMute = useCallback(() => {
|
||||||
setTtsMuted(prev => {
|
setTtsMuted(prev => {
|
||||||
const next = !prev;
|
const next = !prev;
|
||||||
@@ -299,7 +314,12 @@ const ChatScreen: React.FC = () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TTS-Audio abspielen wenn vorhanden — respektiert geraetelokalen Mute/Disable
|
// TTS-Audio abspielen wenn vorhanden — respektiert geraetelokalen Mute/Disable
|
||||||
const canPlay = ttsDeviceEnabled && !ttsMuted;
|
// WICHTIG: via Ref statt direkt state lesen, sonst ist's stale (Closure-Bug).
|
||||||
|
const canPlay = ttsCanPlayRef.current;
|
||||||
|
if (message.type === 'audio_pcm' || (message.type === 'audio' && message.payload.base64)) {
|
||||||
|
console.log('[Chat] audio-msg canPlay=%s (enabled=%s muted=%s)',
|
||||||
|
canPlay, ttsDeviceEnabled, ttsMuted);
|
||||||
|
}
|
||||||
if (message.type === 'audio' && message.payload.base64) {
|
if (message.type === 'audio' && message.payload.base64) {
|
||||||
const b64 = message.payload.base64 as string;
|
const b64 = message.payload.base64 as string;
|
||||||
const refId = (message.payload.messageId as string) || '';
|
const refId = (message.payload.messageId as string) || '';
|
||||||
@@ -439,6 +459,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
durationMs: result.durationMs,
|
durationMs: result.durationMs,
|
||||||
mimeType: result.mimeType,
|
mimeType: result.mimeType,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
|
// resume() wird durch onPlaybackFinished nach ARIAs Antwort getriggert.
|
||||||
@@ -460,7 +481,12 @@ const ChatScreen: React.FC = () => {
|
|||||||
// Wake Word Toggle Handler
|
// Wake Word Toggle Handler
|
||||||
const toggleWakeWord = useCallback(async () => {
|
const toggleWakeWord = useCallback(async () => {
|
||||||
if (wakeWordActive) {
|
if (wakeWordActive) {
|
||||||
wakeWordService.stop();
|
// Vor Porcupine-Stop: eventuelle laufende Aufnahme abbrechen. Sonst
|
||||||
|
// bleibt audioService.recordingState=='recording' haengen und der
|
||||||
|
// normale Aufnahme-Button wirkt nicht mehr (startRecording lehnt
|
||||||
|
// ab weil "Aufnahme laeuft bereits").
|
||||||
|
try { await audioService.stopRecording(); } catch {}
|
||||||
|
await wakeWordService.stop();
|
||||||
setWakeWordActive(false);
|
setWakeWordActive(false);
|
||||||
} else {
|
} else {
|
||||||
const started = await wakeWordService.start();
|
const started = await wakeWordService.start();
|
||||||
@@ -550,6 +576,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
rvs.send('chat', {
|
rvs.send('chat', {
|
||||||
text,
|
text,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]);
|
}, [inputText, getCurrentLocation, pendingAttachments, sendPendingAttachments]);
|
||||||
@@ -659,6 +686,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
rvs.send('chat', {
|
rvs.send('chat', {
|
||||||
text: messageText,
|
text: messageText,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
...(location && { location }),
|
...(location && { location }),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -733,9 +761,10 @@ const ChatScreen: React.FC = () => {
|
|||||||
))}
|
))}
|
||||||
{/* Text (nicht anzeigen wenn nur "Anhang empfangen" und ein Bild da ist) */}
|
{/* Text (nicht anzeigen wenn nur "Anhang empfangen" und ein Bild da ist) */}
|
||||||
{!(item.text === 'Anhang empfangen' && item.attachments?.some(a => a.type === 'image' && a.uri)) && (
|
{!(item.text === 'Anhang empfangen' && item.attachments?.some(a => a.type === 'image' && a.uri)) && (
|
||||||
<Text style={[styles.messageText, isUser ? styles.userText : styles.ariaText]}>
|
<MessageText
|
||||||
{item.text}
|
text={item.text}
|
||||||
</Text>
|
style={[styles.messageText, isUser ? styles.userText : styles.ariaText]}
|
||||||
|
/>
|
||||||
)}
|
)}
|
||||||
{/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Bridge-TTS mit aktueller Engine */}
|
{/* Play-Button fuer ARIA-Nachrichten — Cache bevorzugt, sonst Bridge-TTS mit aktueller Engine */}
|
||||||
{!isUser && item.text.length > 0 && (
|
{!isUser && item.text.length > 0 && (
|
||||||
@@ -750,6 +779,7 @@ const ChatScreen: React.FC = () => {
|
|||||||
rvs.send('tts_request' as any, {
|
rvs.send('tts_request' as any, {
|
||||||
text: item.text,
|
text: item.text,
|
||||||
voice: localXttsVoiceRef.current,
|
voice: localXttsVoiceRef.current,
|
||||||
|
speed: ttsSpeedRef.current,
|
||||||
messageId: item.messageId || '',
|
messageId: item.messageId || '',
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,6 +35,10 @@ import {
|
|||||||
CONV_WINDOW_MIN_SEC,
|
CONV_WINDOW_MIN_SEC,
|
||||||
CONV_WINDOW_MAX_SEC,
|
CONV_WINDOW_MAX_SEC,
|
||||||
CONV_WINDOW_STORAGE_KEY,
|
CONV_WINDOW_STORAGE_KEY,
|
||||||
|
TTS_SPEED_DEFAULT,
|
||||||
|
TTS_SPEED_MIN,
|
||||||
|
TTS_SPEED_MAX,
|
||||||
|
TTS_SPEED_STORAGE_KEY,
|
||||||
} from '../services/audio';
|
} from '../services/audio';
|
||||||
import wakeWordService, {
|
import wakeWordService, {
|
||||||
BUILTIN_KEYWORDS,
|
BUILTIN_KEYWORDS,
|
||||||
@@ -98,6 +102,7 @@ const SettingsScreen: React.FC = () => {
|
|||||||
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
||||||
const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC);
|
const [vadSilenceSec, setVadSilenceSec] = useState<number>(VAD_SILENCE_DEFAULT_SEC);
|
||||||
const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC);
|
const [convWindowSec, setConvWindowSec] = useState<number>(CONV_WINDOW_DEFAULT_SEC);
|
||||||
|
const [ttsSpeed, setTtsSpeed] = useState<number>(TTS_SPEED_DEFAULT);
|
||||||
const [wakeAccessKey, setWakeAccessKey] = useState<string>('');
|
const [wakeAccessKey, setWakeAccessKey] = useState<string>('');
|
||||||
const [wakeAccessKeyVisible, setWakeAccessKeyVisible] = useState(false);
|
const [wakeAccessKeyVisible, setWakeAccessKeyVisible] = useState(false);
|
||||||
const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD);
|
const [wakeKeyword, setWakeKeyword] = useState<string>(DEFAULT_KEYWORD);
|
||||||
@@ -153,6 +158,12 @@ const SettingsScreen: React.FC = () => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY).then(saved => {
|
||||||
|
if (saved != null) {
|
||||||
|
const n = parseFloat(saved);
|
||||||
|
if (isFinite(n) && n >= TTS_SPEED_MIN && n <= TTS_SPEED_MAX) setTtsSpeed(n);
|
||||||
|
}
|
||||||
|
});
|
||||||
AsyncStorage.getItem(WAKE_ACCESS_KEY_STORAGE).then(saved => {
|
AsyncStorage.getItem(WAKE_ACCESS_KEY_STORAGE).then(saved => {
|
||||||
if (saved) setWakeAccessKey(saved);
|
if (saved) setWakeAccessKey(saved);
|
||||||
});
|
});
|
||||||
@@ -800,6 +811,38 @@ const SettingsScreen: React.FC = () => {
|
|||||||
<Text style={styles.prerollButtonText}>+0.5</Text>
|
<Text style={styles.prerollButtonText}>+0.5</Text>
|
||||||
</TouchableOpacity>
|
</TouchableOpacity>
|
||||||
</View>
|
</View>
|
||||||
|
|
||||||
|
<Text style={[styles.toggleLabel, {marginTop: 24}]}>Sprechgeschwindigkeit</Text>
|
||||||
|
<Text style={styles.toggleHint}>
|
||||||
|
Wie schnell ARIA spricht. 1.0 = Normal. Niedriger = langsamer, hoeher = schneller.
|
||||||
|
Wird an F5-TTS als speed-Param uebergeben und pro Geraet gespeichert.
|
||||||
|
Default: {TTS_SPEED_DEFAULT.toFixed(1)}x.
|
||||||
|
</Text>
|
||||||
|
<View style={styles.prerollRow}>
|
||||||
|
<TouchableOpacity
|
||||||
|
style={styles.prerollButton}
|
||||||
|
onPress={() => {
|
||||||
|
const next = Math.max(TTS_SPEED_MIN, Math.round((ttsSpeed - 0.1) * 10) / 10);
|
||||||
|
setTtsSpeed(next);
|
||||||
|
AsyncStorage.setItem(TTS_SPEED_STORAGE_KEY, String(next));
|
||||||
|
}}
|
||||||
|
disabled={ttsSpeed <= TTS_SPEED_MIN}
|
||||||
|
>
|
||||||
|
<Text style={styles.prerollButtonText}>−0.1</Text>
|
||||||
|
</TouchableOpacity>
|
||||||
|
<Text style={styles.prerollValue}>{ttsSpeed.toFixed(1)} x</Text>
|
||||||
|
<TouchableOpacity
|
||||||
|
style={styles.prerollButton}
|
||||||
|
onPress={() => {
|
||||||
|
const next = Math.min(TTS_SPEED_MAX, Math.round((ttsSpeed + 0.1) * 10) / 10);
|
||||||
|
setTtsSpeed(next);
|
||||||
|
AsyncStorage.setItem(TTS_SPEED_STORAGE_KEY, String(next));
|
||||||
|
}}
|
||||||
|
disabled={ttsSpeed >= TTS_SPEED_MAX}
|
||||||
|
>
|
||||||
|
<Text style={styles.prerollButtonText}>+0.1</Text>
|
||||||
|
</TouchableOpacity>
|
||||||
|
</View>
|
||||||
</View>
|
</View>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
|||||||
@@ -92,6 +92,24 @@ export const CONV_WINDOW_MIN_SEC = 3.0;
|
|||||||
export const CONV_WINDOW_MAX_SEC = 20.0;
|
export const CONV_WINDOW_MAX_SEC = 20.0;
|
||||||
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';
|
export const CONV_WINDOW_STORAGE_KEY = 'aria_conv_window_sec';
|
||||||
|
|
||||||
|
// TTS-Wiedergabegeschwindigkeit — wird pro Geraet gespeichert und an die
|
||||||
|
// Bridge mitgegeben (speed-Param im F5-TTS infer()). 1.0 = normal.
|
||||||
|
export const TTS_SPEED_DEFAULT = 1.0;
|
||||||
|
export const TTS_SPEED_MIN = 0.1;
|
||||||
|
export const TTS_SPEED_MAX = 5.0;
|
||||||
|
export const TTS_SPEED_STORAGE_KEY = 'aria_tts_speed';
|
||||||
|
|
||||||
|
export async function loadTtsSpeed(): Promise<number> {
|
||||||
|
try {
|
||||||
|
const raw = await AsyncStorage.getItem(TTS_SPEED_STORAGE_KEY);
|
||||||
|
if (raw != null) {
|
||||||
|
const n = parseFloat(raw);
|
||||||
|
if (isFinite(n) && n >= TTS_SPEED_MIN && n <= TTS_SPEED_MAX) return n;
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
|
return TTS_SPEED_DEFAULT;
|
||||||
|
}
|
||||||
|
|
||||||
export async function loadConvWindowMs(): Promise<number> {
|
export async function loadConvWindowMs(): Promise<number> {
|
||||||
try {
|
try {
|
||||||
const raw = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY);
|
const raw = await AsyncStorage.getItem(CONV_WINDOW_STORAGE_KEY);
|
||||||
@@ -125,7 +143,7 @@ const MAX_RECORDING_MS = 120000;
|
|||||||
// Pre-Roll: Wie lange Audio im AudioTrack-Buffer liegt bevor play() startet.
|
// Pre-Roll: Wie lange Audio im AudioTrack-Buffer liegt bevor play() startet.
|
||||||
// Einstellbar via Diagnostic/Settings (Key: aria_tts_preroll_sec).
|
// Einstellbar via Diagnostic/Settings (Key: aria_tts_preroll_sec).
|
||||||
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
|
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
|
||||||
export const TTS_PREROLL_MIN_SEC = 1.0;
|
export const TTS_PREROLL_MIN_SEC = 0; // 0 = sofort abspielen (F5-TTS ist schnell genug)
|
||||||
export const TTS_PREROLL_MAX_SEC = 6.0;
|
export const TTS_PREROLL_MAX_SEC = 6.0;
|
||||||
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';
|
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';
|
||||||
|
|
||||||
@@ -178,6 +196,8 @@ class AudioService {
|
|||||||
private lastSpeechTime: number = 0;
|
private lastSpeechTime: number = 0;
|
||||||
private vadTimer: ReturnType<typeof setInterval> | null = null;
|
private vadTimer: ReturnType<typeof setInterval> | null = null;
|
||||||
private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
|
private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
// Latch damit der Silence-Callback pro Aufnahme genau einmal feuert
|
||||||
|
private silenceFired: boolean = false;
|
||||||
private noSpeechTimer: ReturnType<typeof setTimeout> | null = null;
|
private noSpeechTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
@@ -287,33 +307,46 @@ class AudioService {
|
|||||||
// Andere Apps waehrend der Aufnahme pausieren (Musik, Videos etc.)
|
// Andere Apps waehrend der Aufnahme pausieren (Musik, Videos etc.)
|
||||||
AudioFocus?.requestExclusive().catch(() => {});
|
AudioFocus?.requestExclusive().catch(() => {});
|
||||||
|
|
||||||
// VAD aktivieren — Stille-Dauer aus AsyncStorage (Settings-konfigurierbar)
|
// VAD aktivieren — Stille-Dauer aus AsyncStorage (Settings-konfigurierbar).
|
||||||
|
// WICHTIG: jeder Trigger (VAD-Stille / Max-Dauer / No-Speech-Window)
|
||||||
|
// disable SOFORT den VAD-Flag und clear den Timer, BEVOR die Listener
|
||||||
|
// gefeuert werden. Sonst feuert das setInterval weiter alle 200ms und
|
||||||
|
// ruft stopRecording parallel auf → audio-recorder-player crasht.
|
||||||
this.vadEnabled = autoStop;
|
this.vadEnabled = autoStop;
|
||||||
|
this.silenceFired = false;
|
||||||
|
const fireSilenceOnce = (reason: string) => {
|
||||||
|
if (this.silenceFired) return;
|
||||||
|
this.silenceFired = true;
|
||||||
|
this.vadEnabled = false;
|
||||||
|
if (this.vadTimer) { clearInterval(this.vadTimer); this.vadTimer = null; }
|
||||||
|
if (this.maxDurationTimer) { clearTimeout(this.maxDurationTimer); this.maxDurationTimer = null; }
|
||||||
|
if (this.noSpeechTimer) { clearTimeout(this.noSpeechTimer); this.noSpeechTimer = null; }
|
||||||
|
console.log('[Audio] Silence-Fire: %s', reason);
|
||||||
|
this.silenceListeners.forEach(cb => {
|
||||||
|
try { cb(); } catch (e) { console.warn('[Audio] silence listener err:', e); }
|
||||||
|
});
|
||||||
|
};
|
||||||
if (autoStop) {
|
if (autoStop) {
|
||||||
const vadSilenceMs = await loadVadSilenceMs();
|
const vadSilenceMs = await loadVadSilenceMs();
|
||||||
console.log('[Audio] VAD-Stille:', vadSilenceMs, 'ms');
|
console.log('[Audio] VAD-Stille:', vadSilenceMs, 'ms');
|
||||||
this.vadTimer = setInterval(() => {
|
this.vadTimer = setInterval(() => {
|
||||||
const silenceDuration = Date.now() - this.lastSpeechTime;
|
const silenceDuration = Date.now() - this.lastSpeechTime;
|
||||||
if (silenceDuration >= vadSilenceMs) {
|
if (silenceDuration >= vadSilenceMs) {
|
||||||
console.log(`[Audio] VAD: ${silenceDuration}ms Stille — Auto-Stop`);
|
fireSilenceOnce(`VAD ${silenceDuration}ms Stille`);
|
||||||
this.silenceListeners.forEach(cb => cb());
|
|
||||||
}
|
}
|
||||||
}, 200);
|
}, 200);
|
||||||
// Notbremse: Nach MAX_RECORDING_MS zwangsweise stoppen
|
// Notbremse: Nach MAX_RECORDING_MS zwangsweise stoppen
|
||||||
this.maxDurationTimer = setTimeout(() => {
|
this.maxDurationTimer = setTimeout(() => {
|
||||||
console.warn(`[Audio] Max-Dauer ${MAX_RECORDING_MS}ms erreicht — Zwangs-Stop`);
|
fireSilenceOnce(`Max-Dauer ${MAX_RECORDING_MS}ms`);
|
||||||
this.silenceListeners.forEach(cb => cb());
|
|
||||||
}, MAX_RECORDING_MS);
|
}, MAX_RECORDING_MS);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Conversation-Window: Wenn der User innerhalb noSpeechTimeoutMs nicht
|
// Conversation-Window: Wenn der User innerhalb noSpeechTimeoutMs nicht
|
||||||
// anfaengt zu sprechen → Aufnahme abbrechen (Speech-Gate verwirft sie),
|
// anfaengt zu sprechen → Aufnahme abbrechen (Speech-Gate verwirft sie).
|
||||||
// ChatScreen erkennt das und beendet die Konversation.
|
|
||||||
if (noSpeechTimeoutMs > 0) {
|
if (noSpeechTimeoutMs > 0) {
|
||||||
this.noSpeechTimer = setTimeout(() => {
|
this.noSpeechTimer = setTimeout(() => {
|
||||||
if (!this.speechDetected && this.recordingState === 'recording') {
|
if (!this.speechDetected && this.recordingState === 'recording') {
|
||||||
console.log(`[Audio] Conversation-Window ${noSpeechTimeoutMs}ms ohne Sprache — Stop`);
|
fireSilenceOnce(`Conversation-Window ${noSpeechTimeoutMs}ms ohne Sprache`);
|
||||||
this.silenceListeners.forEach(cb => cb());
|
|
||||||
}
|
}
|
||||||
}, noSpeechTimeoutMs);
|
}, noSpeechTimeoutMs);
|
||||||
}
|
}
|
||||||
@@ -426,7 +459,13 @@ class AudioService {
|
|||||||
|
|
||||||
/** Einen PCM-Chunk aus einer audio_pcm Nachricht empfangen.
|
/** Einen PCM-Chunk aus einer audio_pcm Nachricht empfangen.
|
||||||
* silent=true → nur cachen, nicht abspielen (z.B. wenn TTS geraetelokal gemutet).
|
* silent=true → nur cachen, nicht abspielen (z.B. wenn TTS geraetelokal gemutet).
|
||||||
* Gibt bei final=true den Cache-Pfad zurueck (file://) oder '' wenn nicht gecached. */
|
* Gibt bei final=true den Cache-Pfad zurueck (file://) oder '' wenn nicht gecached.
|
||||||
|
*
|
||||||
|
* Wrapper serialisiert aufeinanderfolgende Chunk-Calls via Promise-Queue —
|
||||||
|
* sonst gabs bei kurzen Streams einen Race: final-Chunk konnte `end()` rufen
|
||||||
|
* BEVOR der vorherige `start()` im Native-Modul fertig war. Der Writer-
|
||||||
|
* Thread sah dann endRequested=true ohne jemals Chunks zu verarbeiten. */
|
||||||
|
private _pcmChunkQueue: Promise<any> = Promise.resolve();
|
||||||
async handlePcmChunk(payload: {
|
async handlePcmChunk(payload: {
|
||||||
base64: string;
|
base64: string;
|
||||||
sampleRate?: number;
|
sampleRate?: number;
|
||||||
@@ -435,12 +474,37 @@ class AudioService {
|
|||||||
chunk?: number;
|
chunk?: number;
|
||||||
final?: boolean;
|
final?: boolean;
|
||||||
silent?: boolean;
|
silent?: boolean;
|
||||||
|
}): Promise<string> {
|
||||||
|
const p = this._pcmChunkQueue.then(() => this._handlePcmChunkImpl(payload)).catch(err => {
|
||||||
|
console.warn('[Audio] handlePcmChunk queued err:', err);
|
||||||
|
return '';
|
||||||
|
});
|
||||||
|
// Chain only on the side effect — callers still get the per-call result
|
||||||
|
this._pcmChunkQueue = p;
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private async _handlePcmChunkImpl(payload: {
|
||||||
|
base64: string;
|
||||||
|
sampleRate?: number;
|
||||||
|
channels?: number;
|
||||||
|
messageId?: string;
|
||||||
|
chunk?: number;
|
||||||
|
final?: boolean;
|
||||||
|
silent?: boolean;
|
||||||
}): Promise<string> {
|
}): Promise<string> {
|
||||||
const silent = !!payload.silent;
|
const silent = !!payload.silent;
|
||||||
if (!silent && !PcmStreamPlayer) {
|
if (!silent && !PcmStreamPlayer) {
|
||||||
console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar');
|
console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar');
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
// Debug-Log bei Chunk 0 eines neuen Streams — damit man im adb logcat
|
||||||
|
// sieht warum der Auto-Playback greift oder nicht.
|
||||||
|
if ((payload.chunk ?? 0) === 0 && !this.pcmStreamActive) {
|
||||||
|
console.log('[Audio] PCM-Stream start: silent=%s messageId=%s sr=%s ch=%s',
|
||||||
|
silent, payload.messageId || '(none)',
|
||||||
|
payload.sampleRate, payload.channels);
|
||||||
|
}
|
||||||
|
|
||||||
const messageId = payload.messageId || '';
|
const messageId = payload.messageId || '';
|
||||||
const sampleRate = payload.sampleRate || 24000;
|
const sampleRate = payload.sampleRate || 24000;
|
||||||
|
|||||||
@@ -29,6 +29,11 @@ class UpdateService {
|
|||||||
private downloading = false;
|
private downloading = false;
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
|
// Beim Start alte APK-Reste aus dem Cache wegraeumen — wenn diese App
|
||||||
|
// laeuft, sind frueher heruntergeladene APKs entweder schon installiert
|
||||||
|
// oder unvollstaendig gewesen. Spart sonst pro Update 20-30MB auf dem Handy.
|
||||||
|
this.cleanupOldApks().catch(() => {});
|
||||||
|
|
||||||
// Auf update_available Nachrichten lauschen
|
// Auf update_available Nachrichten lauschen
|
||||||
rvs.onMessage((msg: RVSMessage) => {
|
rvs.onMessage((msg: RVSMessage) => {
|
||||||
if (msg.type === 'update_available' as any) {
|
if (msg.type === 'update_available' as any) {
|
||||||
@@ -45,6 +50,30 @@ class UpdateService {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Raeumt alte heruntergeladene APK-Dateien aus dem Cache auf. */
|
||||||
|
private async cleanupOldApks(): Promise<void> {
|
||||||
|
try {
|
||||||
|
const files = await RNFS.readDir(RNFS.CachesDirectoryPath);
|
||||||
|
const apks = files.filter(f => /\.apk$/i.test(f.name));
|
||||||
|
let freed = 0;
|
||||||
|
for (const f of apks) {
|
||||||
|
try {
|
||||||
|
const size = parseInt(f.size as any, 10) || 0;
|
||||||
|
await RNFS.unlink(f.path);
|
||||||
|
freed += size;
|
||||||
|
console.log(`[Update] Alte APK geloescht: ${f.name} (${(size / 1024 / 1024).toFixed(1)}MB)`);
|
||||||
|
} catch (err: any) {
|
||||||
|
console.warn(`[Update] APK-Loeschen fehlgeschlagen: ${f.name} (${err?.message || err})`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (apks.length > 0) {
|
||||||
|
console.log(`[Update] Cleanup fertig: ${apks.length} APKs entfernt, ${(freed / 1024 / 1024).toFixed(1)}MB freigegeben`);
|
||||||
|
}
|
||||||
|
} catch (err: any) {
|
||||||
|
console.warn(`[Update] Cleanup-Fehler: ${err?.message || err}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Bei App-Start Update pruefen */
|
/** Bei App-Start Update pruefen */
|
||||||
checkForUpdate(): void {
|
checkForUpdate(): void {
|
||||||
if (this.checking) return;
|
if (this.checking) return;
|
||||||
@@ -111,6 +140,10 @@ class UpdateService {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Vor dem Schreiben alte APKs im Cache wegraeumen — falls mehrere
|
||||||
|
// Updates in einer Session gezogen werden
|
||||||
|
await this.cleanupOldApks();
|
||||||
|
|
||||||
// Base64 als APK-Datei speichern
|
// Base64 als APK-Datei speichern
|
||||||
const destPath = `${RNFS.CachesDirectoryPath}/${apkData.fileName}`;
|
const destPath = `${RNFS.CachesDirectoryPath}/${apkData.fileName}`;
|
||||||
await RNFS.writeFile(destPath, apkData.base64, 'base64');
|
await RNFS.writeFile(destPath, apkData.base64, 'base64');
|
||||||
|
|||||||
@@ -90,12 +90,32 @@ class WakeWordService {
|
|||||||
if (this.initInProgress) return this.initInProgress;
|
if (this.initInProgress) return this.initInProgress;
|
||||||
this.initInProgress = (async () => {
|
this.initInProgress = (async () => {
|
||||||
try {
|
try {
|
||||||
const { PorcupineManager } = require('@picovoice/porcupine-react-native');
|
const porcupineRN = require('@picovoice/porcupine-react-native');
|
||||||
// Built-In Keyword-Identifier sind lower-case strings im SDK
|
const { PorcupineManager, BuiltInKeywords } = porcupineRN;
|
||||||
|
// Manche Porcupine-Versionen wollen das BuiltInKeywords-Enum (Objekt
|
||||||
|
// mit keys wie JARVIS, COMPUTER, HEY_GOOGLE), andere akzeptieren
|
||||||
|
// den String direkt. Mappen mit Fallback auf String:
|
||||||
|
const enumKey = this.keyword.toUpperCase().replace(/\s+/g, '_');
|
||||||
|
const kw = (BuiltInKeywords && BuiltInKeywords[enumKey]) || this.keyword;
|
||||||
|
console.log('[WakeWord] Porcupine init: keyword=%s (resolved=%s)',
|
||||||
|
this.keyword, typeof kw === 'string' ? kw : '[enum]');
|
||||||
this.porcupine = await PorcupineManager.fromBuiltInKeywords(
|
this.porcupine = await PorcupineManager.fromBuiltInKeywords(
|
||||||
this.accessKey,
|
this.accessKey,
|
||||||
[this.keyword],
|
[kw],
|
||||||
(_keywordIndex: number) => this.onWakeDetected(),
|
(keywordIndex: number) => {
|
||||||
|
console.log('[WakeWord] Porcupine callback fired (index=%d)', keywordIndex);
|
||||||
|
this.onWakeDetected().catch(err =>
|
||||||
|
console.warn('[WakeWord] onWakeDetected crashed:', err));
|
||||||
|
},
|
||||||
|
// Error handler (wenn Porcupine im Background-Thread crashed,
|
||||||
|
// z.B. beim Audio-Engine-Konflikt mit audio-recorder-player)
|
||||||
|
(error: any) => {
|
||||||
|
console.warn('[WakeWord] Porcupine runtime error:', error?.message || error);
|
||||||
|
// Nicht in Loop crashen — state zurueck auf off damit der User
|
||||||
|
// mit dem Aufnahme-Button wieder normal arbeiten kann
|
||||||
|
this.setState('off');
|
||||||
|
this.disposePorcupine().catch(() => {});
|
||||||
|
},
|
||||||
);
|
);
|
||||||
console.log('[WakeWord] Porcupine init OK (keyword=%s)', this.keyword);
|
console.log('[WakeWord] Porcupine init OK (keyword=%s)', this.keyword);
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
+31
-1
@@ -541,6 +541,9 @@ class ARIABridge:
|
|||||||
# Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt.
|
# Wird fuer die direkt folgende ARIA-Antwort genutzt und dann zurueckgesetzt.
|
||||||
# So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request).
|
# So kann jedes Geraet seine bevorzugte Stimme bekommen (pro Request).
|
||||||
self._next_voice_override: Optional[str] = None
|
self._next_voice_override: Optional[str] = None
|
||||||
|
# Gleiche Logik fuer die Wiedergabegeschwindigkeit (F5-TTS speed-Param,
|
||||||
|
# App-Setting aria_tts_speed, 1.0 = normal).
|
||||||
|
self._next_speed_override: Optional[float] = None
|
||||||
# STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten.
|
# STT-Requests die aktuell auf Antwort von der whisper-bridge (Gamebox) warten.
|
||||||
# requestId → Future mit dem Text (oder None bei Fehler).
|
# requestId → Future mit dem Text (oder None bei Fehler).
|
||||||
self._pending_stt: dict[str, asyncio.Future] = {}
|
self._pending_stt: dict[str, asyncio.Future] = {}
|
||||||
@@ -911,6 +914,12 @@ class ARIABridge:
|
|||||||
logger.info("[core] Nutze Voice-Override: %s", self._next_voice_override)
|
logger.info("[core] Nutze Voice-Override: %s", self._next_voice_override)
|
||||||
self._next_voice_override = None
|
self._next_voice_override = None
|
||||||
|
|
||||||
|
# Speed ebenfalls aus App-Override nehmen (fallback 1.0)
|
||||||
|
xtts_speed = self._next_speed_override or 1.0
|
||||||
|
if self._next_speed_override:
|
||||||
|
logger.info("[core] Nutze Speed-Override: %.2fx", self._next_speed_override)
|
||||||
|
self._next_speed_override = None
|
||||||
|
|
||||||
tts_text = tts_text_preview or text
|
tts_text = tts_text_preview or text
|
||||||
if not tts_text:
|
if not tts_text:
|
||||||
logger.info("[core] TTS-Text leer nach Cleanup — uebersprungen")
|
logger.info("[core] TTS-Text leer nach Cleanup — uebersprungen")
|
||||||
@@ -926,6 +935,7 @@ class ARIABridge:
|
|||||||
"payload": {
|
"payload": {
|
||||||
"text": tts_text,
|
"text": tts_text,
|
||||||
"voice": xtts_voice,
|
"voice": xtts_voice,
|
||||||
|
"speed": xtts_speed,
|
||||||
"language": "de",
|
"language": "de",
|
||||||
"requestId": xtts_request_id,
|
"requestId": xtts_request_id,
|
||||||
"messageId": message_id,
|
"messageId": message_id,
|
||||||
@@ -1163,6 +1173,13 @@ class ARIABridge:
|
|||||||
if voice_override:
|
if voice_override:
|
||||||
self._next_voice_override = voice_override
|
self._next_voice_override = voice_override
|
||||||
logger.info("[rvs] Voice-Override fuer naechste Antwort: %s", voice_override)
|
logger.info("[rvs] Voice-Override fuer naechste Antwort: %s", voice_override)
|
||||||
|
# Speed-Override (TTS-Wiedergabegeschwindigkeit, pro Geraet)
|
||||||
|
try:
|
||||||
|
speed = float(payload.get("speed", 0) or 0)
|
||||||
|
if 0.1 <= speed <= 5.0:
|
||||||
|
self._next_speed_override = speed
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
pass
|
||||||
if text:
|
if text:
|
||||||
logger.info("[rvs] App-Chat: '%s'", text[:80])
|
logger.info("[rvs] App-Chat: '%s'", text[:80])
|
||||||
await self.send_to_core(text, source="app")
|
await self.send_to_core(text, source="app")
|
||||||
@@ -1215,8 +1232,14 @@ class ARIABridge:
|
|||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
tts_text = clean_text_for_tts(text) or text
|
tts_text = clean_text_for_tts(text) or text
|
||||||
# Voice aus App-Payload gewinnt, sonst global
|
# Voice + Speed aus App-Payload gewinnen, sonst global/default
|
||||||
xtts_voice = payload.get("voice", "") or getattr(self, 'xtts_voice', '')
|
xtts_voice = payload.get("voice", "") or getattr(self, 'xtts_voice', '')
|
||||||
|
try:
|
||||||
|
xtts_speed = float(payload.get("speed", 0) or 0)
|
||||||
|
if not (0.1 <= xtts_speed <= 5.0):
|
||||||
|
xtts_speed = 1.0
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
xtts_speed = 1.0
|
||||||
try:
|
try:
|
||||||
xtts_request_id = str(uuid.uuid4())
|
xtts_request_id = str(uuid.uuid4())
|
||||||
if message_id:
|
if message_id:
|
||||||
@@ -1226,6 +1249,7 @@ class ARIABridge:
|
|||||||
"payload": {
|
"payload": {
|
||||||
"text": tts_text,
|
"text": tts_text,
|
||||||
"voice": xtts_voice,
|
"voice": xtts_voice,
|
||||||
|
"speed": xtts_speed,
|
||||||
"language": "de",
|
"language": "de",
|
||||||
"requestId": xtts_request_id,
|
"requestId": xtts_request_id,
|
||||||
"messageId": message_id,
|
"messageId": message_id,
|
||||||
@@ -1424,6 +1448,12 @@ class ARIABridge:
|
|||||||
if voice_override:
|
if voice_override:
|
||||||
self._next_voice_override = voice_override
|
self._next_voice_override = voice_override
|
||||||
logger.info("[rvs] Voice-Override (via Audio): %s", voice_override)
|
logger.info("[rvs] Voice-Override (via Audio): %s", voice_override)
|
||||||
|
try:
|
||||||
|
speed = float(payload.get("speed", 0) or 0)
|
||||||
|
if 0.1 <= speed <= 5.0:
|
||||||
|
self._next_speed_override = speed
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
pass
|
||||||
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
|
logger.info("[rvs] Audio empfangen: %s, %dms, %dKB",
|
||||||
mime_type, duration_ms, len(audio_b64) // 1365)
|
mime_type, duration_ms, len(audio_b64) // 1365)
|
||||||
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
|
||||||
|
|||||||
+106
-1
@@ -136,6 +136,34 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Voice-Preview Modal -->
|
||||||
|
<div id="voice-preview-modal" style="display:none;position:fixed;inset:0;z-index:1000;background:rgba(0,0,0,0.7);align-items:center;justify-content:center;">
|
||||||
|
<div style="background:#1A1A2E;border:1px solid #2A2A3E;border-radius:10px;padding:20px;max-width:560px;width:90%;display:flex;flex-direction:column;gap:12px;">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;">
|
||||||
|
<h3 style="margin:0;color:#fff;">Stimmen-Preview: <span id="voice-preview-name">—</span></h3>
|
||||||
|
<button onclick="closeVoicePreview()" style="background:none;border:none;color:#8888AA;font-size:22px;cursor:pointer;">×</button>
|
||||||
|
</div>
|
||||||
|
<textarea id="voice-preview-text" rows="4"
|
||||||
|
style="background:#0D0D1A;border:1px solid #2A2A3E;border-radius:6px;padding:10px;color:#fff;font-size:13px;resize:vertical;"></textarea>
|
||||||
|
|
||||||
|
<div style="display:flex;align-items:center;gap:10px;font-size:12px;color:#8888AA;">
|
||||||
|
<span style="min-width:120px;">Geschwindigkeit:</span>
|
||||||
|
<button onclick="adjustPreviewSpeed(-0.1)" class="btn secondary" style="padding:4px 10px;font-size:12px;">−0.1</button>
|
||||||
|
<span id="voice-preview-speed-value" style="min-width:52px;text-align:center;color:#fff;font-weight:600;">1.0 x</span>
|
||||||
|
<button onclick="adjustPreviewSpeed(0.1)" class="btn secondary" style="padding:4px 10px;font-size:12px;">+0.1</button>
|
||||||
|
<span style="color:#555570;font-size:11px;">(nur fuer dieses Modal, wird nicht gespeichert)</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="display:flex;gap:8px;align-items:center;">
|
||||||
|
<button id="voice-preview-play" onclick="playVoicePreview()" class="btn primary" style="padding:8px 16px;">
|
||||||
|
▶ Abspielen
|
||||||
|
</button>
|
||||||
|
<span id="voice-preview-status" style="color:#8888AA;font-size:11px;flex:1;"></span>
|
||||||
|
</div>
|
||||||
|
<audio id="voice-preview-audio" controls style="width:100%;display:none;"></audio>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Disk-Space Warnung (dynamisch gesetzt) -->
|
<!-- Disk-Space Warnung (dynamisch gesetzt) -->
|
||||||
<div id="disk-banner" style="display:none;position:sticky;top:0;z-index:500;padding:10px 14px;border-radius:0;margin:-16px -16px 12px -16px;font-size:13px;">
|
<div id="disk-banner" style="display:none;position:sticky;top:0;z-index:500;padding:10px 14px;border-radius:0;margin:-16px -16px 12px -16px;font-size:13px;">
|
||||||
<div style="display:flex;align-items:center;gap:10px;flex-wrap:wrap;">
|
<div style="display:flex;align-items:center;gap:10px;flex-wrap:wrap;">
|
||||||
@@ -930,6 +958,24 @@
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (msg.type === 'voice_preview_audio') {
|
||||||
|
const statusEl = document.getElementById('voice-preview-status');
|
||||||
|
const audio = document.getElementById('voice-preview-audio');
|
||||||
|
const playBtn = document.getElementById('voice-preview-play');
|
||||||
|
if (playBtn) playBtn.disabled = false;
|
||||||
|
if (msg.error) {
|
||||||
|
if (statusEl) statusEl.textContent = '❌ Fehler: ' + msg.error;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (msg.base64 && audio) {
|
||||||
|
audio.src = 'data:audio/wav;base64,' + msg.base64;
|
||||||
|
audio.style.display = 'block';
|
||||||
|
audio.play().catch(() => {});
|
||||||
|
if (statusEl) statusEl.textContent = '✅ fertig';
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (msg.type === 'voice_ready') {
|
if (msg.type === 'voice_ready') {
|
||||||
const v = msg.payload?.voice || '';
|
const v = msg.payload?.voice || '';
|
||||||
const err = msg.payload?.error;
|
const err = msg.payload?.error;
|
||||||
@@ -1579,16 +1625,75 @@
|
|||||||
html += '<div style="display:flex;flex-direction:column;gap:4px;">';
|
html += '<div style="display:flex;flex-direction:column;gap:4px;">';
|
||||||
for (const v of voices) {
|
for (const v of voices) {
|
||||||
const esc = (s) => String(s).replace(/[&<>"']/g, c => ({ "&":"&", "<":"<", ">":">", '"':""", "'":"'" }[c]));
|
const esc = (s) => String(s).replace(/[&<>"']/g, c => ({ "&":"&", "<":"<", ">":">", '"':""", "'":"'" }[c]));
|
||||||
|
const jsName = esc(v.name).replace(/'/g, "\\'");
|
||||||
html += `<div style="display:flex;align-items:center;gap:8px;background:#1E1E2E;border-radius:4px;padding:4px 8px;font-size:12px;">`
|
html += `<div style="display:flex;align-items:center;gap:8px;background:#1E1E2E;border-radius:4px;padding:4px 8px;font-size:12px;">`
|
||||||
+ `<span style="flex:1;color:#E0E0F0;">${esc(v.name)}</span>`
|
+ `<span style="flex:1;color:#E0E0F0;">${esc(v.name)}</span>`
|
||||||
+ `<span style="color:#555570;font-size:10px;">${(v.size/1024).toFixed(0)}KB</span>`
|
+ `<span style="color:#555570;font-size:10px;">${(v.size/1024).toFixed(0)}KB</span>`
|
||||||
+ `<button class="btn secondary" onclick="deleteXttsVoice('${esc(v.name).replace(/'/g, "\\'")}')" style="padding:2px 8px;font-size:10px;color:#FF6B6B;" title="Stimme loeschen">X</button>`
|
+ `<button class="btn secondary" onclick="openVoicePreview('${jsName}')" style="padding:2px 8px;font-size:12px;" title="Stimme anhoeren">▶</button>`
|
||||||
|
+ `<button class="btn secondary" onclick="deleteXttsVoice('${jsName}')" style="padding:2px 8px;font-size:10px;color:#FF6B6B;" title="Stimme loeschen">X</button>`
|
||||||
+ `</div>`;
|
+ `</div>`;
|
||||||
}
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
box.innerHTML = html;
|
box.innerHTML = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Voice Preview Modal ─────────────────────────
|
||||||
|
const VOICE_PREVIEW_DEFAULT = 'Hallo, ich bin ARIA. Das hier ist ein kleiner Test damit du meine Stimme beurteilen kannst.';
|
||||||
|
const PREVIEW_SPEED_DEFAULT = 1.0;
|
||||||
|
const PREVIEW_SPEED_MIN = 0.1;
|
||||||
|
const PREVIEW_SPEED_MAX = 5.0;
|
||||||
|
let currentPreviewVoice = '';
|
||||||
|
let currentPreviewSpeed = PREVIEW_SPEED_DEFAULT;
|
||||||
|
|
||||||
|
function _refreshPreviewSpeedLabel() {
|
||||||
|
const el = document.getElementById('voice-preview-speed-value');
|
||||||
|
if (el) el.textContent = currentPreviewSpeed.toFixed(1) + ' x';
|
||||||
|
}
|
||||||
|
|
||||||
|
function adjustPreviewSpeed(delta) {
|
||||||
|
const next = Math.round((currentPreviewSpeed + delta) * 10) / 10;
|
||||||
|
if (next < PREVIEW_SPEED_MIN || next > PREVIEW_SPEED_MAX) return;
|
||||||
|
currentPreviewSpeed = next;
|
||||||
|
_refreshPreviewSpeedLabel();
|
||||||
|
}
|
||||||
|
|
||||||
|
function openVoicePreview(name) {
|
||||||
|
currentPreviewVoice = name;
|
||||||
|
// Speed bei jedem Oeffnen zuruecksetzen — bewusst kein persist
|
||||||
|
currentPreviewSpeed = PREVIEW_SPEED_DEFAULT;
|
||||||
|
_refreshPreviewSpeedLabel();
|
||||||
|
document.getElementById('voice-preview-name').textContent = name;
|
||||||
|
// Text bei jedem Oeffnen zuruecksetzen
|
||||||
|
document.getElementById('voice-preview-text').value = VOICE_PREVIEW_DEFAULT;
|
||||||
|
document.getElementById('voice-preview-status').textContent = '';
|
||||||
|
const audio = document.getElementById('voice-preview-audio');
|
||||||
|
audio.style.display = 'none';
|
||||||
|
audio.src = '';
|
||||||
|
document.getElementById('voice-preview-modal').style.display = 'flex';
|
||||||
|
}
|
||||||
|
|
||||||
|
function closeVoicePreview() {
|
||||||
|
document.getElementById('voice-preview-modal').style.display = 'none';
|
||||||
|
const audio = document.getElementById('voice-preview-audio');
|
||||||
|
try { audio.pause(); } catch {}
|
||||||
|
}
|
||||||
|
|
||||||
|
function playVoicePreview() {
|
||||||
|
const text = (document.getElementById('voice-preview-text').value || '').trim();
|
||||||
|
if (!text) {
|
||||||
|
document.getElementById('voice-preview-status').textContent = 'Text leer';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
document.getElementById('voice-preview-status').textContent = '⏳ Rendere...';
|
||||||
|
document.getElementById('voice-preview-play').disabled = true;
|
||||||
|
send({
|
||||||
|
action: 'preview_voice',
|
||||||
|
voice: currentPreviewVoice,
|
||||||
|
text,
|
||||||
|
speed: currentPreviewSpeed,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
function deleteXttsVoice(name) {
|
function deleteXttsVoice(name) {
|
||||||
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
|
if (!confirm(`Stimme "${name}" endgueltig loeschen?`)) return;
|
||||||
send({ action: 'xtts_delete_voice', name });
|
send({ action: 'xtts_delete_voice', name });
|
||||||
|
|||||||
+105
-13
@@ -653,6 +653,9 @@ function connectRVS(forcePlain) {
|
|||||||
log("info", "rvs", `service_status ${svc} ${state}${model ? ` (${model})` : ""}`);
|
log("info", "rvs", `service_status ${svc} ${state}${model ? ` (${model})` : ""}`);
|
||||||
}
|
}
|
||||||
broadcast({ type: "service_status", payload: msg.payload });
|
broadcast({ type: "service_status", payload: msg.payload });
|
||||||
|
} else if (msg.type === "audio_pcm" && msg.payload && _previewPending.size > 0) {
|
||||||
|
// PCM-Chunks einer laufenden Voice-Preview — sammeln + WAV bauen
|
||||||
|
_handlePreviewChunk(msg.payload);
|
||||||
} else {
|
} else {
|
||||||
log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`);
|
log("debug", "rvs", `Nachricht: ${JSON.stringify(msg).slice(0, 150)}`);
|
||||||
}
|
}
|
||||||
@@ -1439,19 +1442,14 @@ wss.on("connection", (ws) => {
|
|||||||
xttsVoice: msg.xttsVoice || "",
|
xttsVoice: msg.xttsVoice || "",
|
||||||
};
|
};
|
||||||
if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel;
|
if (msg.whisperModel !== undefined) voiceConfig.whisperModel = msg.whisperModel;
|
||||||
// F5-TTS Tuning-Felder — leere Strings entfernen damit der Default greift
|
// F5-TTS Tuning-Felder — immer mit dem vom User gesendeten Wert setzen,
|
||||||
if (msg.f5ttsModel !== undefined) {
|
// auch leeren String. Leer = "reset auf Hard-Default". Sonst merkt die
|
||||||
if (msg.f5ttsModel) voiceConfig.f5ttsModel = msg.f5ttsModel;
|
// Bridge nicht dass der User den Wert loeschen wollte (absent key war
|
||||||
else delete voiceConfig.f5ttsModel;
|
// vorher 'keep current' semantik → BigVGAN blieb drin obwohl User
|
||||||
}
|
// leer eingetragen hatte).
|
||||||
if (msg.f5ttsCkptFile !== undefined) {
|
if (msg.f5ttsModel !== undefined) voiceConfig.f5ttsModel = msg.f5ttsModel || "";
|
||||||
if (msg.f5ttsCkptFile) voiceConfig.f5ttsCkptFile = msg.f5ttsCkptFile;
|
if (msg.f5ttsCkptFile !== undefined) voiceConfig.f5ttsCkptFile = msg.f5ttsCkptFile || "";
|
||||||
else delete voiceConfig.f5ttsCkptFile;
|
if (msg.f5ttsVocabFile !== undefined) voiceConfig.f5ttsVocabFile = msg.f5ttsVocabFile || "";
|
||||||
}
|
|
||||||
if (msg.f5ttsVocabFile !== undefined) {
|
|
||||||
if (msg.f5ttsVocabFile) voiceConfig.f5ttsVocabFile = msg.f5ttsVocabFile;
|
|
||||||
else delete voiceConfig.f5ttsVocabFile;
|
|
||||||
}
|
|
||||||
if (msg.f5ttsCfgStrength !== undefined && !isNaN(msg.f5ttsCfgStrength)) {
|
if (msg.f5ttsCfgStrength !== undefined && !isNaN(msg.f5ttsCfgStrength)) {
|
||||||
voiceConfig.f5ttsCfgStrength = msg.f5ttsCfgStrength;
|
voiceConfig.f5ttsCfgStrength = msg.f5ttsCfgStrength;
|
||||||
}
|
}
|
||||||
@@ -1470,6 +1468,8 @@ wss.on("connection", (ws) => {
|
|||||||
handleSaveTriggers(ws, msg.triggers || []);
|
handleSaveTriggers(ws, msg.triggers || []);
|
||||||
} else if (msg.action === "test_tts") {
|
} else if (msg.action === "test_tts") {
|
||||||
handleTestTTS(ws, msg.text || "Test");
|
handleTestTTS(ws, msg.text || "Test");
|
||||||
|
} else if (msg.action === "preview_voice") {
|
||||||
|
handleVoicePreview(ws, msg.voice || "", msg.text || "Hallo.", msg.speed);
|
||||||
} else if (msg.action === "check_tts") {
|
} else if (msg.action === "check_tts") {
|
||||||
handleCheckTTS(ws);
|
handleCheckTTS(ws);
|
||||||
} else if (msg.action === "check_desktop") {
|
} else if (msg.action === "check_desktop") {
|
||||||
@@ -1642,6 +1642,98 @@ async function handleSaveTriggers(clientWs, triggers) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ── TTS Diagnose (XTTS) ───────────────────────────────
|
// ── TTS Diagnose (XTTS) ───────────────────────────────
|
||||||
|
// ── Voice Preview ────────────────────────────────────────
|
||||||
|
// Sammelt audio_pcm Chunks einer Preview-Anfrage, baut am Ende eine WAV
|
||||||
|
// und schickt sie base64-kodiert an den Browser-Client.
|
||||||
|
//
|
||||||
|
// Map requestId → { clientWs, chunks: [Buffer], sampleRate, channels }
|
||||||
|
const _previewPending = new Map();
|
||||||
|
|
||||||
|
/**
 * Prepends a 44-byte RIFF/WAVE header (PCM format tag 1, 16 bits per sample)
 * to a buffer of raw PCM data.
 *
 * @param {Buffer} pcmBuf      Raw PCM bytes; the byte length is written as the data chunk size.
 * @param {number} sampleRate  Sample rate stored in the fmt chunk.
 * @param {number} channels    Channel count stored in the fmt chunk.
 * @returns {Buffer} A new buffer holding the header followed by the PCM bytes.
 */
function _buildWavFromPcm(pcmBuf, sampleRate, channels) {
  const bitsPerSample = 16;
  const payloadBytes = pcmBuf.length;
  const bytesPerSecond = sampleRate * channels * bitsPerSample / 8;
  const bytesPerFrame = channels * bitsPerSample / 8;

  // Header layout in write order; every entry advances the cursor by its own width.
  const layout = [
    ["tag", "RIFF"],
    ["u32", 36 + payloadBytes],
    ["tag", "WAVE"],
    ["tag", "fmt "],
    ["u32", 16],              // subchunk1 size
    ["u16", 1],               // PCM
    ["u16", channels],
    ["u32", sampleRate],
    ["u32", bytesPerSecond],
    ["u16", bytesPerFrame],
    ["u16", bitsPerSample],
    ["tag", "data"],
    ["u32", payloadBytes],
  ];

  const hdr = Buffer.alloc(44);
  let cursor = 0;
  for (const [kind, value] of layout) {
    if (kind === "tag") {
      // Buffer#write returns the number of bytes written.
      cursor += hdr.write(value, cursor);
    } else if (kind === "u32") {
      // The writeUInt* methods return the offset just past the written value.
      cursor = hdr.writeUInt32LE(value, cursor);
    } else {
      cursor = hdr.writeUInt16LE(value, cursor);
    }
  }
  return Buffer.concat([hdr, pcmBuf]);
}
|
||||||
|
|
||||||
|
/**
 * Processes one audio chunk message for a pending voice preview.
 *
 * Chunks whose requestId has no entry in `_previewPending` are ignored.
 * Otherwise the decoded base64 data is appended, the first truthy
 * sampleRate / channels values are remembered, and on the chunk flagged
 * `final` the entry is removed, all chunks are wrapped into a WAV and sent
 * to the requesting client as a `voice_preview_audio` message.
 *
 * @param {object} payload  Chunk message; reads requestId, base64, sampleRate, channels, final.
 */
function _handlePreviewChunk(payload) {
  const reqId = payload?.requestId || "";
  const pending = _previewPending.get(reqId);
  if (!pending) return;

  if (payload.base64) {
    try {
      pending.chunks.push(Buffer.from(payload.base64, "base64"));
    } catch {
      // Best effort: a chunk that cannot be decoded is skipped.
    }
  }

  // Only the first truthy value per field is kept; later chunks do not override it.
  for (const field of ["sampleRate", "channels"]) {
    if (!pending[field] && payload[field]) pending[field] = payload[field];
  }

  if (!payload.final) return;
  _previewPending.delete(reqId);

  // Delivers a message to the requesting client only while its socket is open (readyState 1).
  const reply = (message) => {
    const sock = pending.clientWs;
    if (sock && sock.readyState === 1) {
      sock.send(JSON.stringify(message));
    }
  };

  try {
    const wav = _buildWavFromPcm(
      Buffer.concat(pending.chunks),
      pending.sampleRate || 24000,
      pending.channels || 1,
    );
    reply({
      type: "voice_preview_audio",
      base64: wav.toString("base64"),
      size: wav.length,
    });
  } catch (err) {
    reply({
      type: "voice_preview_audio",
      error: err.message,
    });
  }
}
|
||||||
|
|
||||||
|
/**
 * Starts a voice preview for a browser client.
 *
 * Registers a pending entry in `_previewPending` under a fresh request id,
 * arms a 60 s timeout that reports an error if the entry is still pending,
 * and forwards an `xtts_request` via `sendToRVS_raw`.
 *
 * @param {object} clientWs  WebSocket of the requesting client.
 * @param {string} voice     Voice name passed through to the request payload.
 * @param {string} text      Text to synthesize.
 * @param {*} speed          Requested speed; anything non-finite or outside 0.1-5.0 falls back to 1.0.
 */
async function handleVoicePreview(clientWs, voice, text, speed) {
  // Reports an error to the client, but only while its socket is open (readyState 1).
  // Sending on a closed socket could throw and surface as an unhandled rejection.
  const reportError = (message) => {
    if (clientWs && clientWs.readyState === 1) {
      clientWs.send(JSON.stringify({ type: "voice_preview_audio", error: message }));
    }
  };

  // Declared outside the try block so the catch branch can unregister the request.
  let requestId = null;
  try {
    // Clamp speed — the browser slider covers 0.1-5.0
    let spd = parseFloat(speed);
    if (!isFinite(spd) || spd < 0.1 || spd > 5.0) spd = 1.0;

    requestId = crypto.randomUUID();
    _previewPending.set(requestId, { clientWs, chunks: [], sampleRate: 0, channels: 0 });

    // Timeout safety net: fires only if the entry is still pending after 60 s.
    setTimeout(() => {
      if (_previewPending.has(requestId)) {
        _previewPending.delete(requestId);
        reportError("Timeout (60s) — keine Antwort vom f5tts-bridge");
      }
    }, 60000);

    log("info", "server", `Voice-Preview: voice="${voice}" speed=${spd.toFixed(1)}x text="${text.slice(0, 60)}"`);
    sendToRVS_raw({
      type: "xtts_request",
      payload: { text, language: "de", requestId, voice, speed: spd },
      timestamp: Date.now(),
    });
  } catch (err) {
    // Drop the pending entry so the timeout does not send a second,
    // misleading "Timeout" error after this failure was already reported.
    if (requestId) _previewPending.delete(requestId);
    reportError(err.message);
  }
}
|
||||||
|
|
||||||
async function handleTestTTS(clientWs, text) {
|
async function handleTestTTS(clientWs, text) {
|
||||||
try {
|
try {
|
||||||
log("info", "server", `TTS-Test via XTTS: "${text}"`);
|
log("info", "server", `TTS-Test via XTTS: "${text}"`);
|
||||||
|
|||||||
@@ -70,22 +70,34 @@
|
|||||||
- [x] VAD-Stille einstellbar in App-Settings (1.0-8.0s, Default 2.8s)
|
- [x] VAD-Stille einstellbar in App-Settings (1.0-8.0s, Default 2.8s)
|
||||||
- [x] MAX_RECORDING auf 120s — laengere Erklaerungen moeglich
|
- [x] MAX_RECORDING auf 120s — laengere Erklaerungen moeglich
|
||||||
- [x] App: Audioausgabe hoert nicht mehr mitten im Satz auf (playbackHeadPosition wait + Stop-Race fix)
|
- [x] App: Audioausgabe hoert nicht mehr mitten im Satz auf (playbackHeadPosition wait + Stop-Race fix)
|
||||||
|
- [x] F5-TTS: Referenz-WAV-Preprocessing — Loudness-Normalisierung -16 LUFS + Silence-Trim + 10s Clip fuer konsistente Cloning-Quali
|
||||||
|
- [x] F5-TTS: deutsches Fine-Tune (aihpi/F5-TTS-German, Vocos-Variante) via hf:// Pfad in Diagnostic konfigurierbar
|
||||||
|
- [x] Whisper transkribiert Voice-Uploads nicht mehr mit hardcoded "small" — aktuelles Modell wird behalten, kein unnoetiger Modell-Swap
|
||||||
|
- [x] RVS/WebSocket maxPayload 50MB: voice_upload mit WAV als base64 sprengt kein Frame-Limit mehr
|
||||||
|
- [x] Dynamischer STT-Timeout in aria-bridge: 300s waehrend whisper-bridge 'loading', 45s wenn 'ready'
|
||||||
|
- [x] service_status Broadcasts: f5tts/whisper melden Lade-Status, Banner in Diagnostic (unten rechts) + App (oben)
|
||||||
|
- [x] config_request Pattern: Bridges fragen beim Connect die aktuelle Voice-Config an, aria-bridge antwortet
|
||||||
|
- [x] F5-TTS Tuning via Diagnostic (Modell-ID, Checkpoint, cfg_strength, nfe_step) statt ENV-Vars — Hot-Reload bei Modell-Wechsel
|
||||||
|
- [x] Conversation-Window: Gespraechsmodus endet nach X Sekunden Stille (1.0-20.0s, Default 8s, einstellbar in Settings)
|
||||||
|
- [x] Porcupine Wake-Word-Integration in der App (Built-In Keywords + Custom spaeter, per Geraet einstellbar)
|
||||||
|
- [x] HF-Cache als Bind-Mount statt Docker Volume — kein .vhdx-Bloat auf Docker Desktop / Windows
|
||||||
|
- [x] cleanup-windows.ps1 / .bat: VHDX-Cleanup via diskpart (ohne Hyper-V) mit Self-Elevation
|
||||||
|
- [x] App Mute-/Auto-Playback-Bug: Closure-Bug geloest (ttsCanPlayRef live-gespiegelt, nicht mehr stale)
|
||||||
|
- [x] App Zombie-Recording: Ohr-aus killt die laufende Aufnahme, damit der Aufnahme-Button weiter funktioniert
|
||||||
|
- [x] App Text-Rendering: Nachrichten selektierbar + Autolink fuer URLs/E-Mails/Telefonnummern (Browser/Mail/Dialer)
|
||||||
|
- [x] TTS-Wiedergabegeschwindigkeit pro Geraet einstellbar (Settings → 0.5-2.0x in 0.1-Schritten, Default 1.0)
|
||||||
|
- [x] Diagnostic: Voice-Preview-Modal (Play-Icon vor Delete-X, Textfeld mit Default, WAV im Browser abspielen)
|
||||||
|
|
||||||
## Offen
|
## Offen
|
||||||
|
|
||||||
### Bugs
|
### Bugs
|
||||||
- [ ] NO_REPLY wird als "NO" im Chat angezeigt — sollte still verworfen werden (Token nicht gesaeubert)
|
- [ ] App: Wake-Word "jarvis" triggert nicht zuverlaessig (Porcupine-Debugging via ADB-Logcat ausstehend)
|
||||||
|
- [ ] App: Stuerzt beim Lauschen ab, eventuell bei Nebengeraeuschen (Porcupine + Mic-Race, errorCallback haelt's jetzt zurueck — Dauertest ausstehend)
|
||||||
|
|
||||||
### App Features
|
### App Features
|
||||||
- [ ] Wake Word on-device (Porcupine "ARIA" Keyword, Phase 2 — passives Lauschen)
|
|
||||||
- [ ] Chat-History zuverlaessiger laden (AsyncStorage Race Condition)
|
- [ ] Chat-History zuverlaessiger laden (AsyncStorage Race Condition)
|
||||||
- [ ] Background Audio Service (TTS auch bei minimierter App)
|
- [ ] Background Audio Service (TTS auch bei minimierter App)
|
||||||
|
|
||||||
### TTS / Audio
|
|
||||||
- [ ] Audio-Normalisierung (Lautstaerke zwischen Saetzen/Chunks angleichen)
|
|
||||||
- [ ] F5-TTS: Streaming-Inferenz testen (nativ statt satzweise) wenn ein passendes Backend kommt
|
|
||||||
- [ ] F5-TTS: Optional Deepspeed-Beschleunigung pruefen
|
|
||||||
|
|
||||||
### Architektur
|
### Architektur
|
||||||
- [ ] Bilder: Claude Vision direkt nutzen (aktuell nur Dateipfad an ARIA)
|
- [ ] Bilder: Claude Vision direkt nutzen (aktuell nur Dateipfad an ARIA)
|
||||||
- [ ] Auto-Compacting und Memory/Brain Verwaltung (SQLite?)
|
- [ ] Auto-Compacting und Memory/Brain Verwaltung (SQLite?)
|
||||||
|
|||||||
+56
-13
@@ -175,10 +175,31 @@ class F5Runner:
|
|||||||
|
|
||||||
async def update_config(self, payload: dict) -> None:
|
async def update_config(self, payload: dict) -> None:
|
||||||
"""Liest f5tts*-Felder aus einem config-Broadcast.
|
"""Liest f5tts*-Felder aus einem config-Broadcast.
|
||||||
Bei Modell-relevantem Wechsel wird neu geladen."""
|
Bei Modell-relevantem Wechsel wird neu geladen.
|
||||||
new_model = (payload.get("f5ttsModel") or "").strip() or self.model_id
|
|
||||||
new_ckpt = payload.get("f5ttsCkptFile", self.ckpt_file) or ""
|
Semantik:
|
||||||
new_vocab = payload.get("f5ttsVocabFile", self.vocab_file) or ""
|
- key fehlt in payload → aktuellen Wert behalten
|
||||||
|
- key da, nicht-leerer str → diesen Wert nehmen
|
||||||
|
- key da, leerer string → RESET auf Hard-Default (User hat Feld
|
||||||
|
in Diagnostic geleert und Apply geklickt)
|
||||||
|
"""
|
||||||
|
if "f5ttsModel" in payload:
|
||||||
|
v = (payload.get("f5ttsModel") or "").strip()
|
||||||
|
new_model = v if v else DEFAULT_F5TTS_MODEL
|
||||||
|
else:
|
||||||
|
new_model = self.model_id
|
||||||
|
|
||||||
|
if "f5ttsCkptFile" in payload:
|
||||||
|
v = payload.get("f5ttsCkptFile") or ""
|
||||||
|
new_ckpt = v.strip() if isinstance(v, str) else ""
|
||||||
|
else:
|
||||||
|
new_ckpt = self.ckpt_file
|
||||||
|
|
||||||
|
if "f5ttsVocabFile" in payload:
|
||||||
|
v = payload.get("f5ttsVocabFile") or ""
|
||||||
|
new_vocab = v.strip() if isinstance(v, str) else ""
|
||||||
|
else:
|
||||||
|
new_vocab = self.vocab_file
|
||||||
try:
|
try:
|
||||||
new_cfg = float(payload.get("f5ttsCfgStrength", self.cfg_strength))
|
new_cfg = float(payload.get("f5ttsCfgStrength", self.cfg_strength))
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
@@ -216,7 +237,8 @@ class F5Runner:
|
|||||||
else:
|
else:
|
||||||
logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)
|
logger.info("F5-TTS Live-Config: cfg_strength=%.2f nfe=%d", new_cfg, new_nfe)
|
||||||
|
|
||||||
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
|
def _infer_blocking(self, gen_text: str, ref_wav: str, ref_text: str,
|
||||||
|
speed: float = 1.0) -> tuple[np.ndarray, int]:
|
||||||
wav, sr, _ = self.model.infer(
|
wav, sr, _ = self.model.infer(
|
||||||
ref_file=ref_wav,
|
ref_file=ref_wav,
|
||||||
ref_text=ref_text,
|
ref_text=ref_text,
|
||||||
@@ -225,6 +247,7 @@ class F5Runner:
|
|||||||
seed=-1,
|
seed=-1,
|
||||||
cfg_strength=self.cfg_strength,
|
cfg_strength=self.cfg_strength,
|
||||||
nfe_step=self.nfe_step,
|
nfe_step=self.nfe_step,
|
||||||
|
speed=speed,
|
||||||
)
|
)
|
||||||
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
|
# F5-TTS gibt float32 1D-Array — auf 24kHz sample-rate standard
|
||||||
if not isinstance(wav, np.ndarray):
|
if not isinstance(wav, np.ndarray):
|
||||||
@@ -233,10 +256,11 @@ class F5Runner:
|
|||||||
wav = wav.squeeze()
|
wav = wav.squeeze()
|
||||||
return wav.astype(np.float32), int(sr)
|
return wav.astype(np.float32), int(sr)
|
||||||
|
|
||||||
async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str) -> tuple[np.ndarray, int]:
|
async def synthesize(self, gen_text: str, ref_wav: str, ref_text: str,
|
||||||
|
speed: float = 1.0) -> tuple[np.ndarray, int]:
|
||||||
await self.ensure_loaded()
|
await self.ensure_loaded()
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text)
|
return await loop.run_in_executor(None, self._infer_blocking, gen_text, ref_wav, ref_text, speed)
|
||||||
|
|
||||||
|
|
||||||
# ── Helpers ─────────────────────────────────────────────────
|
# ── Helpers ─────────────────────────────────────────────────
|
||||||
@@ -268,7 +292,15 @@ def split_sentences(text: str, max_len: int = 350) -> list[str]:
|
|||||||
|
|
||||||
|
|
||||||
def float_to_pcm16(wav: np.ndarray) -> bytes:
    """Convert float audio (-1..+1) to 16-bit little-endian PCM bytes.

    F5-TTS occasionally produces NaN/Inf samples when it becomes unstable.
    Without sanitizing, the cast to int16 would be undefined (RuntimeWarning
    plus broken sound), so non-finite samples are replaced first: NaN -> 0.0,
    +Inf -> 1.0, -Inf -> -1.0. Values are then clipped to [-1.0, 1.0] and
    scaled by 32767.

    Args:
        wav: Array of float samples.

    Returns:
        The samples as int16 bytes in little-endian order.
    """
    # A single pass counts NaN and +/-Inf together.
    bad_count = int(np.count_nonzero(~np.isfinite(wav)))
    if bad_count > 0:
        logger.warning("F5-TTS Output enthaelt %d NaN/Inf samples — ersetze mit 0", bad_count)
        wav = np.nan_to_num(wav, nan=0.0, posinf=1.0, neginf=-1.0)
    wav = np.clip(wav, -1.0, 1.0)
    # "<i2" pins little-endian int16 regardless of the host byte order;
    # plain np.int16 would emit native-endian bytes.
    pcm = (wav * 32767.0).astype("<i2")
    return pcm.tobytes()
|
||||||
@@ -366,7 +398,10 @@ async def request_transcription(ws, wav_path: Path, language: str = "de") -> Opt
|
|||||||
"requestId": request_id,
|
"requestId": request_id,
|
||||||
"audio": audio_b64,
|
"audio": audio_b64,
|
||||||
"mimeType": "audio/wav",
|
"mimeType": "audio/wav",
|
||||||
"model": "small", # klein reicht fuer Voice-Referenz
|
# KEIN hardcoded model — whisper-bridge nimmt das bereits
|
||||||
|
# geladene. Sonst wuerde hier ein Swap auf 'small' passieren und
|
||||||
|
# danach muesste das in Diagnostic konfigurierte Modell (z.B.
|
||||||
|
# large-v3) wieder geladen werden → doppelter Download.
|
||||||
"language": language,
|
"language": language,
|
||||||
})
|
})
|
||||||
return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S)
|
return await asyncio.wait_for(fut, timeout=_STT_TIMEOUT_S)
|
||||||
@@ -389,9 +424,9 @@ _tts_queue: asyncio.Queue[tuple] = asyncio.Queue()
|
|||||||
async def _tts_worker(ws, runner: F5Runner) -> None:
|
async def _tts_worker(ws, runner: F5Runner) -> None:
|
||||||
"""Serialisiert Synthesen — GPU kann sonst OOM gehen."""
|
"""Serialisiert Synthesen — GPU kann sonst OOM gehen."""
|
||||||
while True:
|
while True:
|
||||||
text, voice, request_id, message_id, language = await _tts_queue.get()
|
text, voice, request_id, message_id, language, speed = await _tts_queue.get()
|
||||||
try:
|
try:
|
||||||
await _do_tts(ws, runner, text, voice, request_id, message_id, language)
|
await _do_tts(ws, runner, text, voice, request_id, message_id, language, speed)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("TTS-Worker Fehler")
|
logger.exception("TTS-Worker Fehler")
|
||||||
finally:
|
finally:
|
||||||
@@ -399,7 +434,8 @@ async def _tts_worker(ws, runner: F5Runner) -> None:
|
|||||||
|
|
||||||
|
|
||||||
async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
|
async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
|
||||||
request_id: str, message_id: str, language: str) -> None:
|
request_id: str, message_id: str, language: str,
|
||||||
|
speed: float = 1.0) -> None:
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
|
ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
|
||||||
|
|
||||||
@@ -477,7 +513,7 @@ async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
|
|||||||
pcm_sr = TARGET_SR
|
pcm_sr = TARGET_SR
|
||||||
for i, sent in enumerate(sentences):
|
for i, sent in enumerate(sentences):
|
||||||
try:
|
try:
|
||||||
wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text)
|
wav, sr = await runner.synthesize(sent, ref_wav_str, ref_text, speed)
|
||||||
pcm_sr = sr
|
pcm_sr = sr
|
||||||
pcm_bytes = float_to_pcm16(wav)
|
pcm_bytes = float_to_pcm16(wav)
|
||||||
# Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert
|
# Erste PCM-Chunk des allerersten Satzes bekommt Fade-In (maskiert
|
||||||
@@ -722,12 +758,19 @@ async def run_loop(runner: F5Runner) -> None:
|
|||||||
payload = msg.get("payload", {}) or {}
|
payload = msg.get("payload", {}) or {}
|
||||||
|
|
||||||
if mtype == "xtts_request":
|
if mtype == "xtts_request":
|
||||||
|
try:
|
||||||
|
speed = float(payload.get("speed") or 1.0)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
speed = 1.0
|
||||||
|
if not (0.1 <= speed <= 5.0):
|
||||||
|
speed = 1.0
|
||||||
await _tts_queue.put((
|
await _tts_queue.put((
|
||||||
payload.get("text", ""),
|
payload.get("text", ""),
|
||||||
payload.get("voice", "") or "",
|
payload.get("voice", "") or "",
|
||||||
payload.get("requestId", ""),
|
payload.get("requestId", ""),
|
||||||
payload.get("messageId", ""),
|
payload.get("messageId", ""),
|
||||||
payload.get("language", "de"),
|
payload.get("language", "de"),
|
||||||
|
speed,
|
||||||
))
|
))
|
||||||
elif mtype == "voice_upload":
|
elif mtype == "voice_upload":
|
||||||
asyncio.create_task(handle_voice_upload(ws, payload))
|
asyncio.create_task(handle_voice_upload(ws, payload))
|
||||||
|
|||||||
@@ -143,7 +143,11 @@ async def handle_stt_request(ws, payload: dict, runner: WhisperRunner) -> None:
|
|||||||
request_id = payload.get("requestId", "")
|
request_id = payload.get("requestId", "")
|
||||||
audio_b64 = payload.get("audio", "")
|
audio_b64 = payload.get("audio", "")
|
||||||
mime_type = payload.get("mimeType", "audio/mp4")
|
mime_type = payload.get("mimeType", "audio/mp4")
|
||||||
model = payload.get("model") or WHISPER_MODEL
|
# Modell-Auswahl:
|
||||||
|
# payload.model gesetzt → nimm das (aria-bridge sendet's basierend auf Config)
|
||||||
|
# sonst + Modell geladen → behalt das aktuelle (kein sinnloser Swap)
|
||||||
|
# sonst → fallback auf ENV-Default
|
||||||
|
model = payload.get("model") or (runner.model_size if runner.model is not None else WHISPER_MODEL)
|
||||||
language = payload.get("language") or WHISPER_LANGUAGE
|
language = payload.get("language") or WHISPER_LANGUAGE
|
||||||
|
|
||||||
if not audio_b64:
|
if not audio_b64:
|
||||||
|
|||||||
Reference in New Issue
Block a user