Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c8881f9e4d | |||
| 028e3b2240 | |||
| c042f27106 | |||
| 4ceadf8be5 | |||
| ddd30b3059 | |||
| 6c8ba5fe2d | |||
| 32ddac002f |
@@ -79,8 +79,8 @@ android {
|
||||
applicationId "com.ariacockpit"
|
||||
minSdkVersion rootProject.ext.minSdkVersion
|
||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||
versionCode 407
|
||||
versionName "0.0.4.7"
|
||||
versionCode 409
|
||||
versionName "0.0.4.9"
|
||||
// Fallback fuer Libraries mit Product Flavors
|
||||
missingDimensionStrategy 'react-native-camera', 'general'
|
||||
}
|
||||
|
||||
@@ -30,9 +30,10 @@ import java.util.concurrent.LinkedBlockingQueue
|
||||
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
||||
companion object {
|
||||
private const val TAG = "PcmStreamPlayer"
|
||||
// Sekunden Audio die VOR play()-Start gepuffert sein muessen.
|
||||
// 2.5s Vorrat = genug um XTTS-Render-Pausen zwischen Chunks zu puffern.
|
||||
private const val PREROLL_SECONDS = 2.5
|
||||
// Fallback wenn JS keinen Wert uebergibt.
|
||||
private const val DEFAULT_PREROLL_SECONDS = 3.5
|
||||
private const val MIN_PREROLL_SECONDS = 0.5
|
||||
private const val MAX_PREROLL_SECONDS = 10.0
|
||||
// Stille am Stream-Anfang, damit AudioTrack sauber anfaehrt und die
|
||||
// ersten Samples nicht abgeschnitten werden (XTTS-Warmup + play()-Latenz).
|
||||
private const val LEADING_SILENCE_SECONDS = 0.2
|
||||
@@ -53,17 +54,21 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
// ── Lifecycle ──
|
||||
|
||||
@ReactMethod
|
||||
fun start(sampleRate: Int, channels: Int, promise: Promise) {
|
||||
fun start(sampleRate: Int, channels: Int, prerollSeconds: Double, promise: Promise) {
|
||||
try {
|
||||
// Alte Session beenden falls vorhanden
|
||||
stopInternal()
|
||||
|
||||
val prerollSec = prerollSeconds
|
||||
.coerceIn(MIN_PREROLL_SECONDS, MAX_PREROLL_SECONDS)
|
||||
.let { if (it.isFinite() && it > 0) it else DEFAULT_PREROLL_SECONDS }
|
||||
|
||||
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
||||
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
||||
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
|
||||
val bytesPerSecond = sampleRate * channels * 2 // 16-bit = 2 bytes
|
||||
// Buffer muss mindestens PREROLL + etwas Spielraum fassen.
|
||||
val prerollTarget = (bytesPerSecond * PREROLL_SECONDS).toInt()
|
||||
val prerollTarget = (bytesPerSecond * prerollSec).toInt()
|
||||
val bufferSize = (minBuf * 32).coerceAtLeast(prerollTarget * 2)
|
||||
prerollBytes = prerollTarget
|
||||
bytesBuffered = 0
|
||||
@@ -173,7 +178,7 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
}
|
||||
}, "PcmStreamWriter").apply { start() }
|
||||
|
||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B preroll=${prerollBytes}B")
|
||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B preroll=${prerollBytes}B (${prerollSec}s)")
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "start fehlgeschlagen", e)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "aria-cockpit",
|
||||
"version": "0.0.4.7",
|
||||
"version": "0.0.4.9",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"android": "react-native run-android",
|
||||
|
||||
@@ -325,6 +325,15 @@ const ChatScreen: React.FC = () => {
|
||||
const tool = (message.payload.tool as string) || '';
|
||||
setAgentActivity({ activity, tool });
|
||||
}
|
||||
|
||||
// Voice-Config aus Diagnostic — setzt die lokale App-Stimme auf den
|
||||
// gerade in Diagnostic gewaehlten Wert zurueck. User-Wahl in der App
|
||||
// wird dadurch ueberschrieben.
|
||||
if (message.type === ('config' as any)) {
|
||||
const newVoice = ((message.payload as any).xttsVoice as string) ?? '';
|
||||
localXttsVoiceRef.current = newVoice;
|
||||
AsyncStorage.setItem('aria_xtts_voice', newVoice);
|
||||
}
|
||||
});
|
||||
|
||||
const unsubState = rvs.onStateChange((state) => {
|
||||
|
||||
@@ -20,6 +20,12 @@ import AsyncStorage from '@react-native-async-storage/async-storage';
|
||||
import RNFS from 'react-native-fs';
|
||||
import DocumentPicker from 'react-native-document-picker';
|
||||
import rvs, { ConnectionState, RVSMessage, ConnectionConfig, ConnectionLogEntry } from '../services/rvs';
|
||||
import {
|
||||
TTS_PREROLL_DEFAULT_SEC,
|
||||
TTS_PREROLL_MIN_SEC,
|
||||
TTS_PREROLL_MAX_SEC,
|
||||
TTS_PREROLL_STORAGE_KEY,
|
||||
} from '../services/audio';
|
||||
import ModeSelector from '../components/ModeSelector';
|
||||
import QRScanner from '../components/QRScanner';
|
||||
import VoiceCloneModal from '../components/VoiceCloneModal';
|
||||
@@ -73,6 +79,7 @@ const SettingsScreen: React.FC = () => {
|
||||
const [autoDownload, setAutoDownload] = useState(true);
|
||||
const [storageSize, setStorageSize] = useState('...');
|
||||
const [ttsEnabled, setTtsEnabled] = useState(true);
|
||||
const [ttsPrerollSec, setTtsPrerollSec] = useState<number>(TTS_PREROLL_DEFAULT_SEC);
|
||||
const [editingPath, setEditingPath] = useState(false);
|
||||
const [xttsVoice, setXttsVoice] = useState('');
|
||||
const [availableVoices, setAvailableVoices] = useState<Array<{name: string, size: number}>>([]);
|
||||
@@ -99,6 +106,14 @@ const SettingsScreen: React.FC = () => {
|
||||
AsyncStorage.getItem('aria_tts_enabled').then(saved => {
|
||||
if (saved !== null) setTtsEnabled(saved === 'true');
|
||||
});
|
||||
AsyncStorage.getItem(TTS_PREROLL_STORAGE_KEY).then(saved => {
|
||||
if (saved != null) {
|
||||
const n = parseFloat(saved);
|
||||
if (isFinite(n) && n >= TTS_PREROLL_MIN_SEC && n <= TTS_PREROLL_MAX_SEC) {
|
||||
setTtsPrerollSec(n);
|
||||
}
|
||||
}
|
||||
});
|
||||
AsyncStorage.getItem('aria_xtts_voice').then(saved => {
|
||||
if (saved) setXttsVoice(saved);
|
||||
});
|
||||
@@ -250,6 +265,13 @@ const SettingsScreen: React.FC = () => {
|
||||
}
|
||||
rvs.send('xtts_list_voices' as any, {});
|
||||
}
|
||||
|
||||
// Diagnostic-Voice-Wechsel → lokale App-Stimme auf den neuen Default zuruecksetzen
|
||||
if (message.type === ('config' as any)) {
|
||||
const newVoice = ((message.payload as any).xttsVoice as string) ?? '';
|
||||
setXttsVoice(newVoice);
|
||||
AsyncStorage.setItem('aria_xtts_voice', newVoice);
|
||||
}
|
||||
});
|
||||
|
||||
return () => {
|
||||
@@ -527,6 +549,42 @@ const SettingsScreen: React.FC = () => {
|
||||
/>
|
||||
</View>
|
||||
|
||||
{ttsEnabled && (
|
||||
<View style={{marginTop: 20}}>
|
||||
<Text style={styles.toggleLabel}>Puffer vor Wiedergabestart</Text>
|
||||
<Text style={styles.toggleHint}>
|
||||
Wie viel Audio gesammelt wird bevor die Wiedergabe startet.
|
||||
Hoeher = robuster gegen Render-Pausen, aber mehr Startverzoegerung.
|
||||
Default: {TTS_PREROLL_DEFAULT_SEC.toFixed(1)}s.
|
||||
</Text>
|
||||
<View style={styles.prerollRow}>
|
||||
<TouchableOpacity
|
||||
style={styles.prerollButton}
|
||||
onPress={() => {
|
||||
const next = Math.max(TTS_PREROLL_MIN_SEC, Math.round((ttsPrerollSec - 0.5) * 10) / 10);
|
||||
setTtsPrerollSec(next);
|
||||
AsyncStorage.setItem(TTS_PREROLL_STORAGE_KEY, String(next));
|
||||
}}
|
||||
disabled={ttsPrerollSec <= TTS_PREROLL_MIN_SEC}
|
||||
>
|
||||
<Text style={styles.prerollButtonText}>−0.5</Text>
|
||||
</TouchableOpacity>
|
||||
<Text style={styles.prerollValue}>{ttsPrerollSec.toFixed(1)} s</Text>
|
||||
<TouchableOpacity
|
||||
style={styles.prerollButton}
|
||||
onPress={() => {
|
||||
const next = Math.min(TTS_PREROLL_MAX_SEC, Math.round((ttsPrerollSec + 0.5) * 10) / 10);
|
||||
setTtsPrerollSec(next);
|
||||
AsyncStorage.setItem(TTS_PREROLL_STORAGE_KEY, String(next));
|
||||
}}
|
||||
disabled={ttsPrerollSec >= TTS_PREROLL_MAX_SEC}
|
||||
>
|
||||
<Text style={styles.prerollButtonText}>+0.5</Text>
|
||||
</TouchableOpacity>
|
||||
</View>
|
||||
</View>
|
||||
)}
|
||||
|
||||
{ttsEnabled && (
|
||||
<View style={{marginTop: 20}}>
|
||||
<Text style={styles.toggleLabel}>Stimme (geraetelokal)</Text>
|
||||
@@ -1118,6 +1176,34 @@ const styles = StyleSheet.create({
|
||||
bottomSpacer: {
|
||||
height: 40,
|
||||
},
|
||||
|
||||
prerollRow: {
|
||||
flexDirection: 'row',
|
||||
alignItems: 'center',
|
||||
justifyContent: 'center',
|
||||
marginTop: 12,
|
||||
gap: 16,
|
||||
},
|
||||
prerollButton: {
|
||||
backgroundColor: '#2A2A3E',
|
||||
paddingHorizontal: 18,
|
||||
paddingVertical: 10,
|
||||
borderRadius: 8,
|
||||
minWidth: 72,
|
||||
alignItems: 'center',
|
||||
},
|
||||
prerollButtonText: {
|
||||
color: '#FFFFFF',
|
||||
fontSize: 16,
|
||||
fontWeight: '600',
|
||||
},
|
||||
prerollValue: {
|
||||
color: '#FFFFFF',
|
||||
fontSize: 20,
|
||||
fontWeight: '700',
|
||||
minWidth: 80,
|
||||
textAlign: 'center',
|
||||
},
|
||||
});
|
||||
|
||||
export default SettingsScreen;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
import { Platform, PermissionsAndroid, NativeModules } from 'react-native';
|
||||
import Sound from 'react-native-sound';
|
||||
import RNFS from 'react-native-fs';
|
||||
import AsyncStorage from '@react-native-async-storage/async-storage';
|
||||
import AudioRecorderPlayer, {
|
||||
AudioEncoderAndroidType,
|
||||
AudioSourceAndroidType,
|
||||
@@ -41,7 +42,7 @@ const { AudioFocus, PcmStreamPlayer } = NativeModules as {
|
||||
release: () => Promise<boolean>;
|
||||
};
|
||||
PcmStreamPlayer?: {
|
||||
start: (sampleRate: number, channels: number) => Promise<boolean>;
|
||||
start: (sampleRate: number, channels: number, prerollSeconds: number) => Promise<boolean>;
|
||||
writeChunk: (base64Pcm: string) => Promise<boolean>;
|
||||
end: () => Promise<boolean>;
|
||||
stop: () => Promise<boolean>;
|
||||
@@ -80,6 +81,26 @@ const VAD_SPEECH_MIN_MS = 500; // ms Sprache bevor Aufnahme zaehlt — l
|
||||
// Max-Dauer einer Aufnahme in Gespraechsmodus (Notbremse gegen Runaway-Loops)
|
||||
const MAX_RECORDING_MS = 30000;
|
||||
|
||||
// Pre-Roll: Wie lange Audio im AudioTrack-Buffer liegt bevor play() startet.
|
||||
// Einstellbar via Diagnostic/Settings (Key: aria_tts_preroll_sec).
|
||||
export const TTS_PREROLL_DEFAULT_SEC = 3.5;
|
||||
export const TTS_PREROLL_MIN_SEC = 1.0;
|
||||
export const TTS_PREROLL_MAX_SEC = 6.0;
|
||||
export const TTS_PREROLL_STORAGE_KEY = 'aria_tts_preroll_sec';
|
||||
|
||||
async function loadPrerollSec(): Promise<number> {
|
||||
try {
|
||||
const raw = await AsyncStorage.getItem(TTS_PREROLL_STORAGE_KEY);
|
||||
if (raw != null) {
|
||||
const n = parseFloat(raw);
|
||||
if (isFinite(n) && n >= TTS_PREROLL_MIN_SEC && n <= TTS_PREROLL_MAX_SEC) {
|
||||
return n;
|
||||
}
|
||||
}
|
||||
} catch {}
|
||||
return TTS_PREROLL_DEFAULT_SEC;
|
||||
}
|
||||
|
||||
// --- Audio-Service ---
|
||||
|
||||
class AudioService {
|
||||
@@ -373,8 +394,9 @@ class AudioService {
|
||||
this.pcmBuffer = [];
|
||||
this.pcmBytesCollected = 0;
|
||||
if (!silent) {
|
||||
const prerollSec = await loadPrerollSec();
|
||||
try {
|
||||
await PcmStreamPlayer!.start(sampleRate, channels);
|
||||
await PcmStreamPlayer!.start(sampleRate, channels, prerollSec);
|
||||
} catch (err) {
|
||||
console.error('[Audio] PcmStreamPlayer.start fehlgeschlagen:', err);
|
||||
this.pcmStreamActive = false;
|
||||
|
||||
@@ -257,6 +257,12 @@ def clean_text_for_tts(text: str) -> str:
|
||||
for pat, repl in _UNIT_WORDS:
|
||||
t = _re_tts.sub(pat, repl, t)
|
||||
|
||||
# Generisches Buchstabieren: alle verbleibenden 2-5-Zeichen-Grossbuchstaben-Woerter
|
||||
# (XTTS, USB, DNS, JSON, HTML, ...) → "X T T S". Laeuft NACH der expliziten Liste,
|
||||
# damit TTS/GPU/... schon aufgeloest sind. "WLAN"-artige, die als Wort gesprochen
|
||||
# werden, koennen bei Bedarf explizit in _UNIT_WORDS uebersteuert werden.
|
||||
t = _re_tts.sub(r'\b([A-Z]{2,5})\b', lambda m: " ".join(m.group(1)), t)
|
||||
|
||||
# Anfuehrungszeichen
|
||||
t = _re_tts.sub(r'["""„`]', '', t)
|
||||
|
||||
|
||||
+46
-10
@@ -95,6 +95,25 @@ function connectRVS(forcePlain) {
|
||||
|
||||
// ── TTS Request Handler ─────────────────────────────
|
||||
|
||||
/**
|
||||
* Linearer Fade-In auf einen base64-PCM-Chunk (s16le).
|
||||
* Mascht XTTS-Warmup-Glitches am Anfang eines Renders.
|
||||
*/
|
||||
function applyFadeIn(base64Pcm, sampleRate, channels, fadeMs) {
|
||||
const buf = Buffer.from(base64Pcm, "base64");
|
||||
const totalSamples = buf.length / 2; // s16le
|
||||
const fadeSamples = Math.min(
|
||||
Math.floor((fadeMs / 1000) * sampleRate) * channels,
|
||||
totalSamples
|
||||
);
|
||||
for (let i = 0; i < fadeSamples; i++) {
|
||||
const sample = buf.readInt16LE(i * 2);
|
||||
const gain = i / fadeSamples;
|
||||
buf.writeInt16LE(Math.round(sample * gain), i * 2);
|
||||
}
|
||||
return buf.toString("base64");
|
||||
}
|
||||
|
||||
// ── TTS-Queue ──────────────────────────────────────
|
||||
// XTTS verarbeitet Requests sequenziell, damit Streams sich nicht ueberlappen.
|
||||
// Ohne Queue wuerden parallele Requests parallel streamen → App bekommt
|
||||
@@ -132,20 +151,38 @@ async function _runTTSRequest(payload) {
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 80)}..." (${cleanText.length} chars, voice: ${voice || "default"})`);
|
||||
|
||||
try {
|
||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||
// Im local-Mode erwartet daswer123 XTTS speaker_wav als Basename (ohne .wav,
|
||||
// ohne Pfad) — der Server prefixt EXAMPLE_FOLDER selbst. Wir checken hier
|
||||
// nur das physische File ab um Warnungen zu loggen; runter ans API geht
|
||||
// nur der Name.
|
||||
const voiceFilePath = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceFilePath && fs.existsSync(voiceFilePath);
|
||||
const speakerName = hasCustomVoice ? voice : "";
|
||||
if (voice && !hasCustomVoice) {
|
||||
log(`WARNUNG: Voice "${voice}" angefordert, aber ${voiceFilePath} existiert nicht — nehme Default`);
|
||||
} else if (hasCustomVoice) {
|
||||
log(`Voice "${voice}" verwendet (speaker_wav="${speakerName}")`);
|
||||
}
|
||||
|
||||
let chunkIndex = 0;
|
||||
let pcmMeta = null;
|
||||
let firstChunkSeen = false;
|
||||
|
||||
const onChunk = (pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
let outBase64 = pcmBase64;
|
||||
// Fade-In auf den ersten Chunk — maskiert XTTS-Warmup-Glitches
|
||||
// (autoregressiver Generator hat am Anfang wenig Kontext → Artefakte).
|
||||
if (!firstChunkSeen && pcmBase64) {
|
||||
firstChunkSeen = true;
|
||||
outBase64 = applyFadeIn(pcmBase64, meta.sampleRate, meta.channels, 120);
|
||||
}
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
base64: outBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
@@ -163,7 +200,7 @@ async function _runTTSRequest(payload) {
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
speakerName,
|
||||
onChunk,
|
||||
);
|
||||
} catch (streamErr) {
|
||||
@@ -171,7 +208,7 @@ async function _runTTSRequest(payload) {
|
||||
await streamXTTSBatch(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
speakerName,
|
||||
onChunk,
|
||||
);
|
||||
}
|
||||
@@ -216,15 +253,14 @@ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
|
||||
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
|
||||
// stream_chunk_size=100: Kompromiss zwischen first-audio-latency und
|
||||
// gap-risk. Bei RTX 3060 (RTF 1.48) ~3s bis erster Audio, Chunks gross
|
||||
// genug dass der AudioTrack-Buffer (128KB ≈ 2.7s) zwischen Chunks nicht
|
||||
// leerlauft.
|
||||
// stream_chunk_size=250: grosse Chunks = wenige Chunk-Grenzen = wenig
|
||||
// Render-Artefakte. daswer123 erzeugt an Chunk-Boundaries haeufig Glitches
|
||||
// in den Worten die ueber die Grenze gehen. Hoehere Latenz ist OK.
|
||||
const qs = new URLSearchParams();
|
||||
qs.set("text", text);
|
||||
qs.set("language", language || "de");
|
||||
qs.set("speaker_wav", speakerWav || "");
|
||||
qs.set("stream_chunk_size", "100");
|
||||
qs.set("stream_chunk_size", "250");
|
||||
|
||||
const url = new URL(XTTS_API_URL);
|
||||
const fullPath = `/tts_stream?${qs.toString()}`;
|
||||
|
||||
Reference in New Issue
Block a user