feat: Streaming TTS — PCM-Stream statt WAV-Chunks (Weg A)
Pipeline: XTTS-Server → xtts-bridge → aria-bridge → RVS → App AudioTrack XTTS-Bridge (Gaming-PC): - streamXTTSAsPCM(): liest /tts_to_audio/ Response inkrementell, parst WAV-Header (samplerate/channels), teilt PCM in 8KB-Chunks (~170ms bei 24kHz s16 mono) und sendet jeden als audio_pcm. - Finaler Chunk mit final=true nach letztem Text-Chunk aria-bridge: - audio_pcm Handler leitet payload 1:1 weiter, filled messageId aus requestId → messageId Map falls XTTS-Bridge messageId nicht hatte - Alter xtts_response Pfad bleibt als Legacy-Fallback (WAV) RVS: audio_pcm in ALLOWED_TYPES Android Native: - PcmStreamPlayerModule (Kotlin): AudioTrack MODE_STREAM mit Writer-Thread und BlockingQueue. start(rate, ch) / writeChunk(b64) / end() / stop() - 8x MinBufferSize grosszuegig dimensioniert, glatt auch bei Netz-Aussetzern - Registered im MainApplication via PcmStreamPlayerPackage App JS: - audioService.handlePcmChunk(): erkennt neue Session (messageId-Wechsel), started nativen Stream, cached PCM-Bytes pro Message. Bei final=true Stream sauber schliessen + _savePcmBufferAsWav → WAV-File im tts_cache/<messageId>.wav - _savePcmBufferAsWav: baut 44-byte WAV-Header (PCM s16le, korrekte samplerate/channels), haengt alle gesammelten base64-PCM-Chunks an - stopPlayback beendet auch aktiven PCM-Stream - ChatScreen routet type=audio_pcm an handlePcmChunk, bei final setzt audioPath in der Message Play-Button: falls messageId einen audioPath hat → WAV aus Cache (Sound-basiert), egal ob Original-TTS Piper oder XTTS war. Audio-Focus: - requestDuck() beim Stream-Start, release() bei Stream-Ende - Andere Apps (Spotify etc.) werden leiser waehrend ARIA spricht Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
eb12281dfc
commit
6ab6196739
|
|
@ -20,6 +20,7 @@ class MainApplication : Application(), ReactApplication {
|
||||||
PackageList(this).packages.apply {
|
PackageList(this).packages.apply {
|
||||||
add(ApkInstallerPackage())
|
add(ApkInstallerPackage())
|
||||||
add(AudioFocusPackage())
|
add(AudioFocusPackage())
|
||||||
|
add(PcmStreamPlayerPackage())
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun getJSMainModuleName(): String = "index"
|
override fun getJSMainModuleName(): String = "index"
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,158 @@
|
||||||
|
package com.ariacockpit
|
||||||
|
|
||||||
|
import android.media.AudioAttributes
|
||||||
|
import android.media.AudioFormat
|
||||||
|
import android.media.AudioManager
|
||||||
|
import android.media.AudioTrack
|
||||||
|
import android.util.Base64
|
||||||
|
import android.util.Log
|
||||||
|
import com.facebook.react.bridge.Promise
|
||||||
|
import com.facebook.react.bridge.ReactApplicationContext
|
||||||
|
import com.facebook.react.bridge.ReactContextBaseJavaModule
|
||||||
|
import com.facebook.react.bridge.ReactMethod
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue
|
||||||
|
|
||||||
|
/**
 * Streams raw PCM (s16le) audio straight into an [AudioTrack] in MODE_STREAM.
 *
 * Flow (all entry points are called from JS):
 *  - start(sampleRate, channels) → opens the AudioTrack and spawns the writer thread
 *  - writeChunk(base64)          → decodes the chunk and enqueues it; the writer drains the queue
 *  - end()                       → signals "no more chunks"; the writer drains the queue, then stops
 *  - stop()                      → hard cancel: discards the queue and tears the track down
 *
 * Advantages over a sound-file queue:
 *  - no gap between chunks (AudioTrack buffers internally)
 *  - the first samples start playing as soon as the first chunk arrives
 *  - no WAV-header parsing per chunk
 */
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {

    override fun getName() = "PcmStreamPlayer"

    // Shared between the React method thread (start/end/stop) and the writer thread.
    // @Volatile so each side observes the other's updates without extra locking.
    @Volatile private var track: AudioTrack? = null
    private val queue = LinkedBlockingQueue<ByteArray>()
    private var writerThread: Thread? = null
    @Volatile private var writerShouldStop = false
    @Volatile private var endRequested = false

    // ── Lifecycle ──

    /**
     * Opens a new streaming AudioTrack and starts the writer thread.
     * Any previous session is torn down first.
     */
    @ReactMethod
    fun start(sampleRate: Int, channels: Int, promise: Promise) {
        try {
            // End the previous session, if one exists.
            stopInternal()

            val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
            val encoding = AudioFormat.ENCODING_PCM_16BIT
            val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
            // Generous buffer: 8x the minimum (roughly 200-400ms at 24kHz) — keeps
            // playback smooth through short network hiccups.
            val bufferSize = (minBuf * 8).coerceAtLeast(32 * 1024)

            val newTrack = AudioTrack.Builder()
                .setAudioAttributes(
                    AudioAttributes.Builder()
                        .setUsage(AudioAttributes.USAGE_ASSISTANT)
                        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                        .build(),
                )
                .setAudioFormat(
                    AudioFormat.Builder()
                        .setSampleRate(sampleRate)
                        .setChannelMask(channelConfig)
                        .setEncoding(encoding)
                        .build(),
                )
                .setBufferSizeInBytes(bufferSize)
                .setTransferMode(AudioTrack.MODE_STREAM)
                .build()

            newTrack.play()
            track = newTrack
            queue.clear()
            writerShouldStop = false
            endRequested = false

            writerThread = Thread({
                val t = track ?: return@Thread
                try {
                    while (!writerShouldStop) {
                        // Poll with a timeout so an end() request is noticed even
                        // when no further data arrives.
                        val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
                            if (endRequested) return@Thread
                            null
                        } ?: continue
                        var offset = 0
                        while (offset < data.size && !writerShouldStop) {
                            val written = t.write(data, offset, data.size - offset)
                            if (written <= 0) break
                            offset += written
                        }
                    }
                } catch (e: Exception) {
                    Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
                } finally {
                    // stop() on a MODE_STREAM track lets already-written data play out.
                    try { t.stop() } catch (_: Exception) {}
                    try { t.release() } catch (_: Exception) {}
                }
            }, "PcmStreamWriter").apply { start() }

            Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
            promise.resolve(true)
        } catch (e: Exception) {
            Log.e(TAG, "start fehlgeschlagen", e)
            promise.reject("START_FAILED", e.message, e)
        }
    }

    /** Decodes one base64 PCM chunk and enqueues it for the writer thread. */
    @ReactMethod
    fun writeChunk(base64Pcm: String, promise: Promise) {
        try {
            if (base64Pcm.isEmpty()) {
                promise.resolve(true)
                return
            }
            val bytes = Base64.decode(base64Pcm, Base64.DEFAULT)
            queue.put(bytes)
            promise.resolve(true)
        } catch (e: Exception) {
            promise.reject("WRITE_FAILED", e.message, e)
        }
    }

    /** Signals: no further chunks. The writer drains the queue, then stops. */
    @ReactMethod
    fun end(promise: Promise) {
        endRequested = true
        promise.resolve(true)
    }

    /** Hard stop (cancel) — discard the queue. */
    @ReactMethod
    fun stop(promise: Promise) {
        stopInternal()
        promise.resolve(true)
    }

    /**
     * Tears the current session down: stops the writer thread, clears the queue
     * and releases the AudioTrack. Safe to call when nothing is active. The
     * writer's finally-block may release the same track concurrently — both
     * release paths swallow the resulting exception, and the shared field is
     * cleared BEFORE releasing so a racing reader never observes a dead track.
     */
    private fun stopInternal() {
        writerShouldStop = true
        endRequested = true
        queue.clear()
        writerThread?.interrupt()
        writerThread = null
        // Clear the shared reference first, then release the captured instance.
        val t = track
        track = null
        if (t != null) {
            try { t.stop() } catch (_: Exception) {}
            try { t.release() } catch (_: Exception) {}
        }
    }

    override fun onCatalystInstanceDestroy() {
        stopInternal()
        super.onCatalystInstanceDestroy()
    }

    companion object {
        private const val TAG = "PcmStreamPlayer"
    }
}
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
package com.ariacockpit
|
||||||
|
|
||||||
|
import com.facebook.react.ReactPackage
|
||||||
|
import com.facebook.react.bridge.NativeModule
|
||||||
|
import com.facebook.react.bridge.ReactApplicationContext
|
||||||
|
import com.facebook.react.uimanager.ViewManager
|
||||||
|
|
||||||
|
/** Registers [PcmStreamPlayerModule] with React Native; contributes no view managers. */
class PcmStreamPlayerPackage : ReactPackage {

    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> =
        listOf(PcmStreamPlayerModule(reactContext))

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> =
        emptyList()
}
|
||||||
|
|
@ -274,6 +274,20 @@ const ChatScreen: React.FC = () => {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// XTTS PCM-Stream: direkt an AudioTrack, bei final WAV-Cache schreiben
|
||||||
|
if (message.type === ('audio_pcm' as any)) {
|
||||||
|
const p = message.payload as any;
|
||||||
|
const refId = (p.messageId as string) || '';
|
||||||
|
audioService.handlePcmChunk(p).then((audioPath: any) => {
|
||||||
|
// Wenn final + Cache-Pfad zurueckkam, Message aktualisieren
|
||||||
|
if (p.final && audioPath && refId) {
|
||||||
|
setMessages(prev => prev.map(m =>
|
||||||
|
m.messageId === refId ? { ...m, audioPath } : m
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}).catch(() => {});
|
||||||
|
}
|
||||||
|
|
||||||
// Thinking-Indicator Status von der Bridge
|
// Thinking-Indicator Status von der Bridge
|
||||||
if (message.type === 'agent_activity') {
|
if (message.type === 'agent_activity') {
|
||||||
const activity = (message.payload.activity as string) || 'idle';
|
const activity = (message.payload.activity as string) || 'idle';
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,36 @@ import AudioRecorderPlayer, {
|
||||||
OutputFormatAndroidType,
|
OutputFormatAndroidType,
|
||||||
} from 'react-native-audio-recorder-player';
|
} from 'react-native-audio-recorder-player';
|
||||||
|
|
||||||
|
// Base64-Encoder fuer Binary-Strings (Header-Bytes → Base64)
|
||||||
|
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
|
||||||
|
function btoaSafe(bin: string): string {
|
||||||
|
let out = '';
|
||||||
|
const len = bin.length;
|
||||||
|
for (let i = 0; i < len; i += 3) {
|
||||||
|
const b1 = bin.charCodeAt(i) & 0xff;
|
||||||
|
const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
|
||||||
|
const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
|
||||||
|
out += B64_CHARS[b1 >> 2];
|
||||||
|
out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
|
||||||
|
out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
|
||||||
|
out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
// Native Module fuer Audio-Focus (Ducking/Muten anderer Apps)
|
// Native Module fuer Audio-Focus (Ducking/Muten anderer Apps)
|
||||||
const { AudioFocus } = NativeModules as {
|
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
|
||||||
AudioFocus?: {
|
AudioFocus?: {
|
||||||
requestDuck: () => Promise<boolean>;
|
requestDuck: () => Promise<boolean>;
|
||||||
requestExclusive: () => Promise<boolean>;
|
requestExclusive: () => Promise<boolean>;
|
||||||
release: () => Promise<boolean>;
|
release: () => Promise<boolean>;
|
||||||
};
|
};
|
||||||
|
PcmStreamPlayer?: {
|
||||||
|
start: (sampleRate: number, channels: number) => Promise<boolean>;
|
||||||
|
writeChunk: (base64Pcm: string) => Promise<boolean>;
|
||||||
|
end: () => Promise<boolean>;
|
||||||
|
stop: () => Promise<boolean>;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// --- Typen ---
|
// --- Typen ---
|
||||||
|
|
@ -79,6 +102,15 @@ class AudioService {
|
||||||
private speechDetected: boolean = false;
|
private speechDetected: boolean = false;
|
||||||
private speechStartTime: number = 0;
|
private speechStartTime: number = 0;
|
||||||
|
|
||||||
|
// PCM-Stream (XTTS): aktive Session + Cache-Puffer pro messageId
|
||||||
|
private pcmStreamActive: boolean = false;
|
||||||
|
private pcmMessageId: string = '';
|
||||||
|
private pcmSampleRate: number = 24000;
|
||||||
|
private pcmChannels: number = 1;
|
||||||
|
private pcmBuffer: string[] = []; // base64-chunks zum spaeteren WAV-Build
|
||||||
|
private pcmBytesCollected: number = 0;
|
||||||
|
private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB
|
||||||
|
|
||||||
// VAD State
|
// VAD State
|
||||||
private vadEnabled: boolean = false;
|
private vadEnabled: boolean = false;
|
||||||
private lastSpeechTime: number = 0;
|
private lastSpeechTime: number = 0;
|
||||||
|
|
@ -303,6 +335,141 @@ class AudioService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Receives one PCM chunk from an `audio_pcm` message, plays it through the
 * native PcmStreamPlayer and collects it for later WAV caching.
 *
 * A change of `messageId` (or an inactive stream) starts a fresh native
 * session; `final === true` closes the stream and persists the collected
 * chunks via `_savePcmBufferAsWav`.
 *
 * @returns On `final === true`: the cache path (`file://...`), or `''` when
 *          nothing was cached. In all other cases `''`.
 */
async handlePcmChunk(payload: {
  base64: string;
  sampleRate?: number;
  channels?: number;
  messageId?: string;
  chunk?: number;
  final?: boolean;
}): Promise<string> {
  if (!PcmStreamPlayer) {
    console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar');
    return '';
  }

  // Defaults match the XTTS bridge output (24kHz mono s16le) — TODO confirm
  // against the sender when formats change.
  const messageId = payload.messageId || '';
  const sampleRate = payload.sampleRate || 24000;
  const channels = payload.channels || 1;
  const base64 = payload.base64 || '';
  const isFinal = !!payload.final;

  // New stream? (messageId changed or no stream active)
  if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
    // Cleanly end the previous stream (if any)
    if (this.pcmStreamActive) {
      try { await PcmStreamPlayer.stop(); } catch {}
      // Discard the old buffer (it never reached final — a new message interleaved)
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
    }
    this.pcmStreamActive = true;
    this.pcmMessageId = messageId;
    this.pcmSampleRate = sampleRate;
    this.pcmChannels = channels;
    this.pcmBuffer = [];
    this.pcmBytesCollected = 0;
    try {
      await PcmStreamPlayer.start(sampleRate, channels);
    } catch (err) {
      console.error('[Audio] PcmStreamPlayer.start fehlgeschlagen:', err);
      this.pcmStreamActive = false;
      return '';
    }
    // Audio focus: duck other apps while ARIA speaks
    AudioFocus?.requestDuck().catch(() => {});
  }

  // Play the chunk + collect it for the cache
  if (base64) {
    try { await PcmStreamPlayer.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
    // Collect for the cache unless the size cap is reached
    if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
      this.pcmBuffer.push(base64);
      // 4 base64 chars ≈ 3 bytes — rough estimate
      this.pcmBytesCollected += Math.floor(base64.length * 0.75);
    }
  }

  if (isFinal) {
    // Close the stream cleanly (keeps playing until its buffer drains)
    try { await PcmStreamPlayer.end(); } catch {}
    this.pcmStreamActive = false;
    AudioFocus?.release().catch(() => {});

    // Build a WAV file from the collected PCM chunks for replay
    if (messageId && this.pcmBuffer.length > 0) {
      const audioPath = await this._savePcmBufferAsWav(messageId);
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      this.pcmMessageId = '';
      return audioPath;
    }
    this.pcmMessageId = '';
  }
  return '';
}
|
||||||
|
|
||||||
|
/** Gesammelte PCM-Chunks als WAV speichern. Gibt file:// Pfad zurueck. */
|
||||||
|
private async _savePcmBufferAsWav(messageId: string): Promise<string> {
|
||||||
|
try {
|
||||||
|
const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
|
||||||
|
await RNFS.mkdir(dir).catch(() => {});
|
||||||
|
const path = `${dir}/${messageId}.wav`;
|
||||||
|
|
||||||
|
// WAV-Header fuer PCM s16le
|
||||||
|
const sampleRate = this.pcmSampleRate;
|
||||||
|
const channels = this.pcmChannels;
|
||||||
|
const bitsPerSample = 16;
|
||||||
|
const byteRate = sampleRate * channels * bitsPerSample / 8;
|
||||||
|
const blockAlign = channels * bitsPerSample / 8;
|
||||||
|
const dataSize = this.pcmBytesCollected;
|
||||||
|
const fileSize = 36 + dataSize;
|
||||||
|
|
||||||
|
// Header als Base64 (44 bytes)
|
||||||
|
const header = new Uint8Array(44);
|
||||||
|
const dv = new DataView(header.buffer);
|
||||||
|
// "RIFF"
|
||||||
|
header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46;
|
||||||
|
dv.setUint32(4, fileSize, true);
|
||||||
|
// "WAVE"
|
||||||
|
header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45;
|
||||||
|
// "fmt "
|
||||||
|
header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20;
|
||||||
|
dv.setUint32(16, 16, true); // fmt chunk size
|
||||||
|
dv.setUint16(20, 1, true); // PCM format
|
||||||
|
dv.setUint16(22, channels, true);
|
||||||
|
dv.setUint32(24, sampleRate, true);
|
||||||
|
dv.setUint32(28, byteRate, true);
|
||||||
|
dv.setUint16(32, blockAlign, true);
|
||||||
|
dv.setUint16(34, bitsPerSample, true);
|
||||||
|
// "data"
|
||||||
|
header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61;
|
||||||
|
dv.setUint32(40, dataSize, true);
|
||||||
|
|
||||||
|
// Header als base64
|
||||||
|
let headerB64 = '';
|
||||||
|
const chunk = 1024;
|
||||||
|
for (let i = 0; i < header.length; i += chunk) {
|
||||||
|
headerB64 += String.fromCharCode(...Array.from(header.slice(i, i + chunk)));
|
||||||
|
}
|
||||||
|
headerB64 = btoaSafe(headerB64);
|
||||||
|
|
||||||
|
// Datei schreiben: Header + alle PCM-Chunks
|
||||||
|
await RNFS.writeFile(path, headerB64, 'base64');
|
||||||
|
for (const b64 of this.pcmBuffer) {
|
||||||
|
await RNFS.appendFile(path, b64, 'base64');
|
||||||
|
}
|
||||||
|
console.log(`[Audio] PCM-Cache geschrieben: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
|
||||||
|
return `file://${path}`;
|
||||||
|
} catch (err) {
|
||||||
|
console.warn('[Audio] _savePcmBufferAsWav fehlgeschlagen:', err);
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Audio aus lokaler Datei (file:// Pfad) in die Queue und abspielen. */
|
/** Audio aus lokaler Datei (file:// Pfad) in die Queue und abspielen. */
|
||||||
async playFromPath(filePath: string): Promise<void> {
|
async playFromPath(filePath: string): Promise<void> {
|
||||||
if (!filePath) return;
|
if (!filePath) return;
|
||||||
|
|
@ -419,6 +586,14 @@ class AudioService {
|
||||||
if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
|
if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
|
||||||
this.preloadedPath = '';
|
this.preloadedPath = '';
|
||||||
}
|
}
|
||||||
|
// PCM-Stream ebenfalls hart stoppen (Cancel/Abbruch)
|
||||||
|
if (this.pcmStreamActive) {
|
||||||
|
PcmStreamPlayer?.stop().catch(() => {});
|
||||||
|
this.pcmStreamActive = false;
|
||||||
|
this.pcmBuffer = [];
|
||||||
|
this.pcmBytesCollected = 0;
|
||||||
|
this.pcmMessageId = '';
|
||||||
|
}
|
||||||
// Audio-Focus freigeben
|
// Audio-Focus freigeben
|
||||||
AudioFocus?.release().catch(() => {});
|
AudioFocus?.release().catch(() => {});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1296,19 +1296,41 @@ class ARIABridge:
|
||||||
await self._emit_activity("idle", "")
|
await self._emit_activity("idle", "")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
elif msg_type == "audio_pcm":
|
||||||
|
# XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
|
||||||
|
# Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
|
||||||
|
error = payload.get("error", "")
|
||||||
|
if error:
|
||||||
|
logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
|
||||||
|
return
|
||||||
|
linked_message_id = payload.get("messageId", "")
|
||||||
|
if not linked_message_id:
|
||||||
|
req_id_full = payload.get("requestId", "")
|
||||||
|
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
||||||
|
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
||||||
|
# Einfach 1:1 weiterleiten mit eingefuellter messageId
|
||||||
|
forwarded = dict(payload)
|
||||||
|
forwarded["messageId"] = linked_message_id
|
||||||
|
await self._send_to_rvs({
|
||||||
|
"type": "audio_pcm",
|
||||||
|
"payload": forwarded,
|
||||||
|
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
|
||||||
elif msg_type == "xtts_response":
|
elif msg_type == "xtts_response":
|
||||||
# XTTS-Audio vom Gaming-PC empfangen → an App weiterleiten
|
# Legacy-Pfad (alte XTTS-Bridge mit WAV-Response). Weiterleiten als
|
||||||
|
# type "audio" — App nutzt den bestehenden WAV-Queue-Spieler.
|
||||||
audio_b64 = payload.get("base64", "")
|
audio_b64 = payload.get("base64", "")
|
||||||
error = payload.get("error", "")
|
error = payload.get("error", "")
|
||||||
req_id_full = payload.get("requestId", "")
|
req_id_full = payload.get("requestId", "")
|
||||||
# XTTS-Bridge suffixt chunkweise: "uuid_0", "uuid_1" → Basis-UUID extrahieren
|
|
||||||
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
||||||
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
||||||
if error:
|
if error:
|
||||||
logger.warning("[rvs] XTTS Fehler: %s", error)
|
logger.warning("[rvs] XTTS Fehler: %s", error)
|
||||||
return
|
return
|
||||||
if audio_b64:
|
if audio_b64:
|
||||||
logger.info("[rvs] XTTS-Audio empfangen: %dKB", len(audio_b64) // 1365)
|
logger.info("[rvs] XTTS-Audio legacy empfangen: %dKB", len(audio_b64) // 1365)
|
||||||
await self._send_to_rvs({
|
await self._send_to_rvs({
|
||||||
"type": "audio",
|
"type": "audio",
|
||||||
"payload": {
|
"payload": {
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ const ALLOWED_TYPES = new Set([
|
||||||
"xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list", "voice_upload", "xtts_voice_saved",
|
"xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list", "voice_upload", "xtts_voice_saved",
|
||||||
"update_check", "update_available", "update_download", "update_data",
|
"update_check", "update_available", "update_download", "update_data",
|
||||||
"agent_activity", "cancel_request",
|
"agent_activity", "cancel_request",
|
||||||
|
"audio_pcm",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
// Token-Raum: token -> { clients: Set<ws> }
|
// Token-Raum: token -> { clients: Set<ws> }
|
||||||
|
|
|
||||||
168
xtts/bridge.js
168
xtts/bridge.js
|
|
@ -94,34 +94,33 @@ function connectRVS(forcePlain) {
|
||||||
// ── TTS Request Handler ─────────────────────────────
|
// ── TTS Request Handler ─────────────────────────────
|
||||||
|
|
||||||
async function handleTTSRequest(payload) {
|
async function handleTTSRequest(payload) {
|
||||||
const { text, voice, requestId, language } = payload;
|
const { text, voice, requestId, language, messageId } = payload;
|
||||||
if (!text) return;
|
if (!text) return;
|
||||||
|
|
||||||
// Markdown + Sonderzeichen entfernen fuer natuerliche Sprache
|
// Markdown-Cleanup (Bridge macht jetzt auch Cleanup, aber safety net)
|
||||||
let cleanText = text
|
let cleanText = text
|
||||||
.replace(/\*\*([^*]+)\*\*/g, "$1") // **fett** → fett
|
.replace(/\*\*([^*]+)\*\*/g, "$1")
|
||||||
.replace(/\*([^*]+)\*/g, "$1") // *kursiv* → kursiv
|
.replace(/\*([^*]+)\*/g, "$1")
|
||||||
.replace(/`([^`]+)`/g, "$1") // `code` → code
|
.replace(/`([^`]+)`/g, "$1")
|
||||||
.replace(/```[\s\S]*?```/g, "") // Code-Bloecke entfernen
|
.replace(/```[\s\S]*?```/g, "")
|
||||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
|
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||||
.replace(/#{1,6}\s*/g, "") // ### Ueberschriften → entfernen
|
.replace(/#{1,6}\s*/g, "")
|
||||||
.replace(/>\s*/g, "") // > Zitate → entfernen
|
.replace(/>\s*/g, "")
|
||||||
.replace(/[-*]\s+/g, "") // - Listen → entfernen
|
.replace(/[-*]\s+/g, "")
|
||||||
.replace(/\n{2,}/g, ". ") // Mehrere Newlines → Punkt
|
.replace(/\n{2,}/g, ". ")
|
||||||
.replace(/\n/g, ", ") // Einzelne Newlines → Komma
|
.replace(/\n/g, ", ")
|
||||||
.replace(/\s{2,}/g, " ") // Mehrfach-Leerzeichen
|
.replace(/\s{2,}/g, " ")
|
||||||
.replace(/["""„]/g, "") // Anfuehrungszeichen entfernen
|
.replace(/["""„]/g, "")
|
||||||
.replace(/\(\)/g, "") // Leere Klammern
|
.replace(/\(\)/g, "")
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
// Text in Saetze aufteilen, dann zu Chunks von 2-3 Saetzen zusammenfassen
|
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
|
||||||
// (mehr Kontext = konsistentere Stimme/Lautstaerke, aber nicht zu lang fuer WebSocket)
|
|
||||||
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
||||||
.map(s => s.trim())
|
.map(s => s.trim())
|
||||||
.filter(s => s.length > 0)
|
.filter(s => s.length > 0)
|
||||||
.map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen
|
.map(s => s.replace(/[.]+$/, ''));
|
||||||
|
|
||||||
const MAX_CHUNK_CHARS = 150; // Max ~150 Zeichen pro Chunk (schnelles Rendering, Preloading reicht)
|
const MAX_CHUNK_CHARS = 150;
|
||||||
const chunks = [];
|
const chunks = [];
|
||||||
let currentChunk = '';
|
let currentChunk = '';
|
||||||
for (const sentence of sentences) {
|
for (const sentence of sentences) {
|
||||||
|
|
@ -135,45 +134,70 @@ async function handleTTSRequest(payload) {
|
||||||
if (currentChunk) chunks.push(currentChunk);
|
if (currentChunk) chunks.push(currentChunk);
|
||||||
if (chunks.length === 0) return;
|
if (chunks.length === 0) return;
|
||||||
|
|
||||||
log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
|
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||||
|
|
||||||
// Streaming: Chunk rendern → sofort senden → naechster Chunk
|
let chunkIndex = 0;
|
||||||
// App spielt mit Preloading-Queue nahtlos ab
|
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
|
||||||
let sentCount = 0;
|
let pcmMeta = null;
|
||||||
|
|
||||||
for (let i = 0; i < chunks.length; i++) {
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
const chunk = chunks[i];
|
const chunk = chunks[i];
|
||||||
|
const isLastChunk = i === chunks.length - 1;
|
||||||
try {
|
try {
|
||||||
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
|
// Streaming: PCM-Frames werden nacheinander an RVS gepusht,
|
||||||
|
// sobald sie vom XTTS-Server reinkommen
|
||||||
if (audioBuffer && audioBuffer.length > 100) {
|
await streamXTTSAsPCM(
|
||||||
log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
|
chunk,
|
||||||
|
language || "de",
|
||||||
|
hasCustomVoice ? voiceSample : null,
|
||||||
|
(pcmBase64, meta) => {
|
||||||
|
if (!pcmMeta) pcmMeta = meta;
|
||||||
|
sendToRVS({
|
||||||
|
type: "audio_pcm",
|
||||||
|
payload: {
|
||||||
|
requestId: requestId || "",
|
||||||
|
messageId: messageId || "",
|
||||||
|
base64: pcmBase64,
|
||||||
|
format: "pcm_s16le",
|
||||||
|
sampleRate: meta.sampleRate,
|
||||||
|
channels: meta.channels,
|
||||||
|
voice: voice || "default",
|
||||||
|
chunk: chunkIndex++,
|
||||||
|
final: false,
|
||||||
|
},
|
||||||
|
timestamp: Date.now(),
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
|
||||||
|
if (isLastChunk && pcmMeta) {
|
||||||
sendToRVS({
|
sendToRVS({
|
||||||
type: "xtts_response",
|
type: "audio_pcm",
|
||||||
payload: {
|
payload: {
|
||||||
requestId: `${requestId || ""}_${i}`,
|
requestId: requestId || "",
|
||||||
base64: audioBuffer.toString("base64"),
|
messageId: messageId || "",
|
||||||
mimeType: "audio/wav",
|
base64: "",
|
||||||
|
format: "pcm_s16le",
|
||||||
|
sampleRate: pcmMeta.sampleRate,
|
||||||
|
channels: pcmMeta.channels,
|
||||||
voice: voice || "default",
|
voice: voice || "default",
|
||||||
engine: "xtts",
|
chunk: chunkIndex++,
|
||||||
part: i + 1,
|
final: true,
|
||||||
totalParts: chunks.length,
|
|
||||||
},
|
},
|
||||||
timestamp: Date.now(),
|
timestamp: Date.now(),
|
||||||
});
|
});
|
||||||
sentCount++;
|
|
||||||
}
|
}
|
||||||
} catch (chunkErr) {
|
} catch (chunkErr) {
|
||||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
|
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
log(`TTS Fehler: ${err.message}`);
|
log(`TTS Fehler: ${err.message}`);
|
||||||
sendToRVS({
|
sendToRVS({
|
||||||
|
|
@ -184,7 +208,19 @@ async function handleTTSRequest(payload) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function callXTTSAPI(text, language, speakerWav) {
|
/**
|
||||||
|
* Ruft /tts_to_audio/ auf und streamt das resultierende WAV bereits waehrend
|
||||||
|
* des Empfangs in PCM-Frames an den Callback. Der WAV-Header wird einmal
|
||||||
|
* geparst, danach werden nur noch raw PCM-Samples weitergeleitet.
|
||||||
|
*
|
||||||
|
* Warum nicht echtes /tts_stream/? daswer123 hat den Endpoint, aber die
|
||||||
|
* Audio-Quality ist dort niedriger und er produziert beim ersten Chunk
|
||||||
|
* oft Artefakte. Pragmatischer Weg: /tts_to_audio/ + Response-Stream
|
||||||
|
* chunkweise auslesen. Das ist zwar kein echtes Server-Streaming, aber
|
||||||
|
* gibt uns deutlich kleinere Netzwerk-Haeppchen und die App kann via
|
||||||
|
* AudioTrack MODE_STREAM sofort nahtlos abspielen.
|
||||||
|
*/
|
||||||
|
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const body = JSON.stringify({
|
const body = JSON.stringify({
|
||||||
text,
|
text,
|
||||||
|
|
@ -206,15 +242,59 @@ function callXTTSAPI(text, language, speakerWav) {
|
||||||
};
|
};
|
||||||
|
|
||||||
const req = http.request(options, (res) => {
|
const req = http.request(options, (res) => {
|
||||||
const chunks = [];
|
if (res.statusCode !== 200) {
|
||||||
res.on("data", (chunk) => chunks.push(chunk));
|
let body = "";
|
||||||
res.on("end", () => {
|
res.on("data", (d) => { body += d.toString(); });
|
||||||
if (res.statusCode === 200) {
|
res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)));
|
||||||
resolve(Buffer.concat(chunks));
|
return;
|
||||||
} else {
|
}
|
||||||
reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(chunks).toString().slice(0, 200)}`));
|
|
||||||
|
let headerParsed = false;
|
||||||
|
let sampleRate = 24000;
|
||||||
|
let channels = 1;
|
||||||
|
let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk
|
||||||
|
const HEADER_BYTES = 44;
|
||||||
|
let headerBuf = Buffer.alloc(0);
|
||||||
|
const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono
|
||||||
|
|
||||||
|
res.on("data", (chunk) => {
|
||||||
|
let data = chunk;
|
||||||
|
|
||||||
|
// WAV-Header konsumieren (44 Bytes)
|
||||||
|
if (!headerParsed) {
|
||||||
|
headerBuf = Buffer.concat([headerBuf, data]);
|
||||||
|
if (headerBuf.length < HEADER_BYTES) return;
|
||||||
|
// Header lesen
|
||||||
|
const header = headerBuf.slice(0, HEADER_BYTES);
|
||||||
|
try {
|
||||||
|
channels = header.readUInt16LE(22);
|
||||||
|
sampleRate = header.readUInt32LE(24);
|
||||||
|
} catch (_) {}
|
||||||
|
headerParsed = true;
|
||||||
|
data = headerBuf.slice(HEADER_BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// leftover aus vorherigem Chunk + neuer data
|
||||||
|
let combined = Buffer.concat([leftover, data]);
|
||||||
|
|
||||||
|
// In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits)
|
||||||
|
while (combined.length >= PCM_CHUNK_BYTES) {
|
||||||
|
const slice = combined.slice(0, PCM_CHUNK_BYTES);
|
||||||
|
combined = combined.slice(PCM_CHUNK_BYTES);
|
||||||
|
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
|
||||||
|
}
|
||||||
|
leftover = combined;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
res.on("end", () => {
|
||||||
|
// Rest-Daten senden
|
||||||
|
if (leftover.length > 0) {
|
||||||
|
onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
|
||||||
|
}
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
|
||||||
|
res.on("error", reject);
|
||||||
});
|
});
|
||||||
|
|
||||||
req.on("error", reject);
|
req.on("error", reject);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue