feat: Streaming TTS — PCM-Stream statt WAV-Chunks (Weg A)
Pipeline: XTTS-Server → xtts-bridge → aria-bridge → RVS → App AudioTrack

XTTS-Bridge (Gaming-PC):
- streamXTTSAsPCM(): liest /tts_to_audio/ Response inkrementell, parst WAV-Header (samplerate/channels), teilt PCM in 8KB-Chunks (~170ms bei 24kHz s16 mono) und sendet jeden als audio_pcm.
- Finaler Chunk mit final=true nach letztem Text-Chunk

aria-bridge:
- audio_pcm Handler leitet payload 1:1 weiter, fuellt messageId aus requestId → messageId Map falls XTTS-Bridge messageId nicht hatte
- Alter xtts_response Pfad bleibt als Legacy-Fallback (WAV)

RVS: audio_pcm in ALLOWED_TYPES

Android Native:
- PcmStreamPlayerModule (Kotlin): AudioTrack MODE_STREAM mit Writer-Thread und BlockingQueue. start(rate, ch) / writeChunk(b64) / end() / stop()
- 8x MinBufferSize grosszuegig dimensioniert, glatt auch bei Netz-Aussetzern
- Registriert im MainApplication via PcmStreamPlayerPackage

App JS:
- audioService.handlePcmChunk(): erkennt neue Session (messageId-Wechsel), startet nativen Stream, cached PCM-Bytes pro Message. Bei final=true Stream sauber schliessen + _savePcmBufferAsWav → WAV-File im tts_cache/<messageId>.wav
- _savePcmBufferAsWav: baut 44-Byte WAV-Header (PCM s16le, korrekte samplerate/channels), haengt alle gesammelten base64-PCM-Chunks an
- stopPlayback beendet auch aktiven PCM-Stream
- ChatScreen routet type=audio_pcm an handlePcmChunk, bei final setzt audioPath in der Message

Play-Button: falls messageId einen audioPath hat → WAV aus Cache (Sound-basiert), egal ob Original-TTS Piper oder XTTS war.

Audio-Focus:
- requestDuck() beim Stream-Start, release() bei Stream-Ende
- Andere Apps (Spotify etc.) werden leiser waehrend ARIA spricht

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
eb12281dfc
commit
6ab6196739
|
|
@ -20,6 +20,7 @@ class MainApplication : Application(), ReactApplication {
|
|||
PackageList(this).packages.apply {
|
||||
add(ApkInstallerPackage())
|
||||
add(AudioFocusPackage())
|
||||
add(PcmStreamPlayerPackage())
|
||||
}
|
||||
|
||||
override fun getJSMainModuleName(): String = "index"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,158 @@
|
|||
package com.ariacockpit
|
||||
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioTrack
|
||||
import android.util.Base64
|
||||
import android.util.Log
|
||||
import com.facebook.react.bridge.Promise
|
||||
import com.facebook.react.bridge.ReactApplicationContext
|
||||
import com.facebook.react.bridge.ReactContextBaseJavaModule
|
||||
import com.facebook.react.bridge.ReactMethod
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
|
||||
/**
|
||||
* Streamt PCM-s16le Audio direkt via AudioTrack MODE_STREAM.
|
||||
*
|
||||
* Flow:
|
||||
* JS: start(sampleRate, channels) → öffnet AudioTrack und startet Writer-Thread
|
||||
* JS: writeChunk(base64) → dekodiert, queued, Writer schreibt non-blocking
|
||||
* JS: end() → wartet bis Queue leer, schließt AudioTrack
|
||||
* JS: stop() → Hart stoppen, Queue leeren (Cancel)
|
||||
*
|
||||
* Vorteil gegenüber Sound-File-Queue:
|
||||
* - Keine Gap zwischen Chunks (AudioTrack puffert intern)
|
||||
* - Erste Samples beginnen zu spielen sobald der erste Chunk da ist
|
||||
* - Kein WAV-Header-Parsing pro Chunk
|
||||
*/
|
||||
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
||||
companion object {
|
||||
private const val TAG = "PcmStreamPlayer"
|
||||
}
|
||||
|
||||
override fun getName() = "PcmStreamPlayer"
|
||||
|
||||
private var track: AudioTrack? = null
|
||||
private val queue = LinkedBlockingQueue<ByteArray>()
|
||||
private var writerThread: Thread? = null
|
||||
@Volatile private var writerShouldStop = false
|
||||
@Volatile private var endRequested = false
|
||||
|
||||
// ── Lifecycle ──
|
||||
|
||||
@ReactMethod
|
||||
fun start(sampleRate: Int, channels: Int, promise: Promise) {
|
||||
try {
|
||||
// Alte Session beenden falls vorhanden
|
||||
stopInternal()
|
||||
|
||||
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
||||
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
||||
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
|
||||
// Etwas grosszuegiger Buffer: 8x MinSize (ca. 200-400ms bei 24kHz) — glatt auch bei kleinen Netzwerk-Aussetzern
|
||||
val bufferSize = (minBuf * 8).coerceAtLeast(32 * 1024)
|
||||
|
||||
val newTrack = AudioTrack.Builder()
|
||||
.setAudioAttributes(
|
||||
AudioAttributes.Builder()
|
||||
.setUsage(AudioAttributes.USAGE_ASSISTANT)
|
||||
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
||||
.build(),
|
||||
)
|
||||
.setAudioFormat(
|
||||
AudioFormat.Builder()
|
||||
.setSampleRate(sampleRate)
|
||||
.setChannelMask(channelConfig)
|
||||
.setEncoding(encoding)
|
||||
.build(),
|
||||
)
|
||||
.setBufferSizeInBytes(bufferSize)
|
||||
.setTransferMode(AudioTrack.MODE_STREAM)
|
||||
.build()
|
||||
|
||||
newTrack.play()
|
||||
track = newTrack
|
||||
queue.clear()
|
||||
writerShouldStop = false
|
||||
endRequested = false
|
||||
|
||||
writerThread = Thread({
|
||||
val t = track ?: return@Thread
|
||||
try {
|
||||
while (!writerShouldStop) {
|
||||
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
|
||||
if (endRequested) return@Thread
|
||||
null
|
||||
} ?: continue
|
||||
var offset = 0
|
||||
while (offset < data.size && !writerShouldStop) {
|
||||
val written = t.write(data, offset, data.size - offset)
|
||||
if (written <= 0) break
|
||||
offset += written
|
||||
}
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
||||
} finally {
|
||||
try { t.stop() } catch (_: Exception) {}
|
||||
try { t.release() } catch (_: Exception) {}
|
||||
}
|
||||
}, "PcmStreamWriter").apply { start() }
|
||||
|
||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "start fehlgeschlagen", e)
|
||||
promise.reject("START_FAILED", e.message, e)
|
||||
}
|
||||
}
|
||||
|
||||
@ReactMethod
|
||||
fun writeChunk(base64Pcm: String, promise: Promise) {
|
||||
try {
|
||||
if (base64Pcm.isEmpty()) {
|
||||
promise.resolve(true)
|
||||
return
|
||||
}
|
||||
val bytes = Base64.decode(base64Pcm, Base64.DEFAULT)
|
||||
queue.put(bytes)
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
promise.reject("WRITE_FAILED", e.message, e)
|
||||
}
|
||||
}
|
||||
|
||||
/** Signalisiert: keine weiteren Chunks. Writer wartet auf Queue-Abschluss, dann stoppt. */
|
||||
@ReactMethod
|
||||
fun end(promise: Promise) {
|
||||
endRequested = true
|
||||
promise.resolve(true)
|
||||
}
|
||||
|
||||
/** Harter Stop (Cancel) — Queue verwerfen. */
|
||||
@ReactMethod
|
||||
fun stop(promise: Promise) {
|
||||
stopInternal()
|
||||
promise.resolve(true)
|
||||
}
|
||||
|
||||
private fun stopInternal() {
|
||||
writerShouldStop = true
|
||||
endRequested = true
|
||||
queue.clear()
|
||||
writerThread?.interrupt()
|
||||
writerThread = null
|
||||
val t = track
|
||||
if (t != null) {
|
||||
try { t.stop() } catch (_: Exception) {}
|
||||
try { t.release() } catch (_: Exception) {}
|
||||
}
|
||||
track = null
|
||||
}
|
||||
|
||||
override fun onCatalystInstanceDestroy() {
|
||||
stopInternal()
|
||||
super.onCatalystInstanceDestroy()
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package com.ariacockpit
|
||||
|
||||
import com.facebook.react.ReactPackage
|
||||
import com.facebook.react.bridge.NativeModule
|
||||
import com.facebook.react.bridge.ReactApplicationContext
|
||||
import com.facebook.react.uimanager.ViewManager
|
||||
|
||||
/** React Native package that registers [PcmStreamPlayerModule]; no view managers. */
class PcmStreamPlayerPackage : ReactPackage {
    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> =
        listOf(PcmStreamPlayerModule(reactContext))

    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> =
        emptyList()
}
|
||||
|
|
@ -274,6 +274,20 @@ const ChatScreen: React.FC = () => {
|
|||
}
|
||||
}
|
||||
|
||||
// XTTS PCM-Stream: direkt an AudioTrack, bei final WAV-Cache schreiben
|
||||
if (message.type === ('audio_pcm' as any)) {
|
||||
const p = message.payload as any;
|
||||
const refId = (p.messageId as string) || '';
|
||||
audioService.handlePcmChunk(p).then((audioPath: any) => {
|
||||
// Wenn final + Cache-Pfad zurueckkam, Message aktualisieren
|
||||
if (p.final && audioPath && refId) {
|
||||
setMessages(prev => prev.map(m =>
|
||||
m.messageId === refId ? { ...m, audioPath } : m
|
||||
));
|
||||
}
|
||||
}).catch(() => {});
|
||||
}
|
||||
|
||||
// Thinking-Indicator Status von der Bridge
|
||||
if (message.type === 'agent_activity') {
|
||||
const activity = (message.payload.activity as string) || 'idle';
|
||||
|
|
|
|||
|
|
@ -16,13 +16,36 @@ import AudioRecorderPlayer, {
|
|||
OutputFormatAndroidType,
|
||||
} from 'react-native-audio-recorder-player';
|
||||
|
||||
// Base64-Encoder fuer Binary-Strings (Header-Bytes → Base64)
|
||||
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
|
||||
function btoaSafe(bin: string): string {
|
||||
let out = '';
|
||||
const len = bin.length;
|
||||
for (let i = 0; i < len; i += 3) {
|
||||
const b1 = bin.charCodeAt(i) & 0xff;
|
||||
const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
|
||||
const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
|
||||
out += B64_CHARS[b1 >> 2];
|
||||
out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
|
||||
out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
|
||||
out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Native Module fuer Audio-Focus (Ducking/Muten anderer Apps)
|
||||
const { AudioFocus } = NativeModules as {
|
||||
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
|
||||
AudioFocus?: {
|
||||
requestDuck: () => Promise<boolean>;
|
||||
requestExclusive: () => Promise<boolean>;
|
||||
release: () => Promise<boolean>;
|
||||
};
|
||||
PcmStreamPlayer?: {
|
||||
start: (sampleRate: number, channels: number) => Promise<boolean>;
|
||||
writeChunk: (base64Pcm: string) => Promise<boolean>;
|
||||
end: () => Promise<boolean>;
|
||||
stop: () => Promise<boolean>;
|
||||
};
|
||||
};
|
||||
|
||||
// --- Typen ---
|
||||
|
|
@ -79,6 +102,15 @@ class AudioService {
|
|||
private speechDetected: boolean = false;
|
||||
private speechStartTime: number = 0;
|
||||
|
||||
// PCM-Stream (XTTS): aktive Session + Cache-Puffer pro messageId
|
||||
private pcmStreamActive: boolean = false;
|
||||
private pcmMessageId: string = '';
|
||||
private pcmSampleRate: number = 24000;
|
||||
private pcmChannels: number = 1;
|
||||
private pcmBuffer: string[] = []; // base64-chunks zum spaeteren WAV-Build
|
||||
private pcmBytesCollected: number = 0;
|
||||
private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB
|
||||
|
||||
// VAD State
|
||||
private vadEnabled: boolean = false;
|
||||
private lastSpeechTime: number = 0;
|
||||
|
|
@ -303,6 +335,141 @@ class AudioService {
|
|||
}
|
||||
}
|
||||
|
||||
/** Receives one PCM chunk from an audio_pcm message, plays it, and caches it.
 * On final=true returns the cache path (file://), or '' when nothing was cached. */
async handlePcmChunk(payload: {
  base64: string;
  sampleRate?: number;
  channels?: number;
  messageId?: string;
  chunk?: number;
  final?: boolean;
}): Promise<string> {
  if (!PcmStreamPlayer) {
    console.warn('[Audio] PcmStreamPlayer Native Module nicht verfuegbar');
    return '';
  }

  const messageId = payload.messageId || '';
  const sampleRate = payload.sampleRate || 24000;
  const channels = payload.channels || 1;
  const base64 = payload.base64 || '';
  const isFinal = !!payload.final;

  // New stream? (messageId changed, or no stream currently active)
  if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
    // Cleanly end the previous stream, if there is one.
    if (this.pcmStreamActive) {
      try { await PcmStreamPlayer.stop(); } catch {}
      // Discard the old buffer — it never reached final; a new message interrupted it.
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
    }
    this.pcmStreamActive = true;
    this.pcmMessageId = messageId;
    this.pcmSampleRate = sampleRate;
    this.pcmChannels = channels;
    this.pcmBuffer = [];
    this.pcmBytesCollected = 0;
    try {
      await PcmStreamPlayer.start(sampleRate, channels);
    } catch (err) {
      console.error('[Audio] PcmStreamPlayer.start fehlgeschlagen:', err);
      this.pcmStreamActive = false;
      return '';
    }
    // Audio focus: duck other apps while we speak.
    AudioFocus?.requestDuck().catch(() => {});
  }

  // Play the chunk and collect it for the replay cache.
  if (base64) {
    try { await PcmStreamPlayer.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
    // Collect for the cache while below the size cap.
    if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
      this.pcmBuffer.push(base64);
      // 4 base64 chars ≈ 3 bytes — rough estimate only; ignores '=' padding.
      this.pcmBytesCollected += Math.floor(base64.length * 0.75);
    }
  }

  if (isFinal) {
    // End the stream gracefully (native side keeps playing until its buffer drains).
    try { await PcmStreamPlayer.end(); } catch {}
    this.pcmStreamActive = false;
    AudioFocus?.release().catch(() => {});

    // Build a WAV file for replay from the collected PCM chunks.
    if (messageId && this.pcmBuffer.length > 0) {
      const audioPath = await this._savePcmBufferAsWav(messageId);
      this.pcmBuffer = [];
      this.pcmBytesCollected = 0;
      this.pcmMessageId = '';
      return audioPath;
    }
    this.pcmMessageId = '';
  }
  return '';
}
|
||||
|
||||
/** Gesammelte PCM-Chunks als WAV speichern. Gibt file:// Pfad zurueck. */
|
||||
private async _savePcmBufferAsWav(messageId: string): Promise<string> {
|
||||
try {
|
||||
const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
|
||||
await RNFS.mkdir(dir).catch(() => {});
|
||||
const path = `${dir}/${messageId}.wav`;
|
||||
|
||||
// WAV-Header fuer PCM s16le
|
||||
const sampleRate = this.pcmSampleRate;
|
||||
const channels = this.pcmChannels;
|
||||
const bitsPerSample = 16;
|
||||
const byteRate = sampleRate * channels * bitsPerSample / 8;
|
||||
const blockAlign = channels * bitsPerSample / 8;
|
||||
const dataSize = this.pcmBytesCollected;
|
||||
const fileSize = 36 + dataSize;
|
||||
|
||||
// Header als Base64 (44 bytes)
|
||||
const header = new Uint8Array(44);
|
||||
const dv = new DataView(header.buffer);
|
||||
// "RIFF"
|
||||
header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46;
|
||||
dv.setUint32(4, fileSize, true);
|
||||
// "WAVE"
|
||||
header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45;
|
||||
// "fmt "
|
||||
header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20;
|
||||
dv.setUint32(16, 16, true); // fmt chunk size
|
||||
dv.setUint16(20, 1, true); // PCM format
|
||||
dv.setUint16(22, channels, true);
|
||||
dv.setUint32(24, sampleRate, true);
|
||||
dv.setUint32(28, byteRate, true);
|
||||
dv.setUint16(32, blockAlign, true);
|
||||
dv.setUint16(34, bitsPerSample, true);
|
||||
// "data"
|
||||
header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61;
|
||||
dv.setUint32(40, dataSize, true);
|
||||
|
||||
// Header als base64
|
||||
let headerB64 = '';
|
||||
const chunk = 1024;
|
||||
for (let i = 0; i < header.length; i += chunk) {
|
||||
headerB64 += String.fromCharCode(...Array.from(header.slice(i, i + chunk)));
|
||||
}
|
||||
headerB64 = btoaSafe(headerB64);
|
||||
|
||||
// Datei schreiben: Header + alle PCM-Chunks
|
||||
await RNFS.writeFile(path, headerB64, 'base64');
|
||||
for (const b64 of this.pcmBuffer) {
|
||||
await RNFS.appendFile(path, b64, 'base64');
|
||||
}
|
||||
console.log(`[Audio] PCM-Cache geschrieben: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
|
||||
return `file://${path}`;
|
||||
} catch (err) {
|
||||
console.warn('[Audio] _savePcmBufferAsWav fehlgeschlagen:', err);
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
/** Audio aus lokaler Datei (file:// Pfad) in die Queue und abspielen. */
|
||||
async playFromPath(filePath: string): Promise<void> {
|
||||
if (!filePath) return;
|
||||
|
|
@ -419,6 +586,14 @@ class AudioService {
|
|||
if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
|
||||
this.preloadedPath = '';
|
||||
}
|
||||
// PCM-Stream ebenfalls hart stoppen (Cancel/Abbruch)
|
||||
if (this.pcmStreamActive) {
|
||||
PcmStreamPlayer?.stop().catch(() => {});
|
||||
this.pcmStreamActive = false;
|
||||
this.pcmBuffer = [];
|
||||
this.pcmBytesCollected = 0;
|
||||
this.pcmMessageId = '';
|
||||
}
|
||||
// Audio-Focus freigeben
|
||||
AudioFocus?.release().catch(() => {});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1296,19 +1296,41 @@ class ARIABridge:
|
|||
await self._emit_activity("idle", "")
|
||||
return
|
||||
|
||||
elif msg_type == "audio_pcm":
|
||||
# XTTS-PCM-Stream vom Gaming-PC empfangen → durchleiten zur App.
|
||||
# Wenn in payload kein messageId (alte XTTS-Bridge), aus requestId auflösen.
|
||||
error = payload.get("error", "")
|
||||
if error:
|
||||
logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
|
||||
return
|
||||
linked_message_id = payload.get("messageId", "")
|
||||
if not linked_message_id:
|
||||
req_id_full = payload.get("requestId", "")
|
||||
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
||||
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
||||
# Einfach 1:1 weiterleiten mit eingefuellter messageId
|
||||
forwarded = dict(payload)
|
||||
forwarded["messageId"] = linked_message_id
|
||||
await self._send_to_rvs({
|
||||
"type": "audio_pcm",
|
||||
"payload": forwarded,
|
||||
"timestamp": int(asyncio.get_event_loop().time() * 1000),
|
||||
})
|
||||
return
|
||||
|
||||
elif msg_type == "xtts_response":
|
||||
# XTTS-Audio vom Gaming-PC empfangen → an App weiterleiten
|
||||
# Legacy-Pfad (alte XTTS-Bridge mit WAV-Response). Weiterleiten als
|
||||
# type "audio" — App nutzt den bestehenden WAV-Queue-Spieler.
|
||||
audio_b64 = payload.get("base64", "")
|
||||
error = payload.get("error", "")
|
||||
req_id_full = payload.get("requestId", "")
|
||||
# XTTS-Bridge suffixt chunkweise: "uuid_0", "uuid_1" → Basis-UUID extrahieren
|
||||
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
|
||||
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
|
||||
if error:
|
||||
logger.warning("[rvs] XTTS Fehler: %s", error)
|
||||
return
|
||||
if audio_b64:
|
||||
logger.info("[rvs] XTTS-Audio empfangen: %dKB", len(audio_b64) // 1365)
|
||||
logger.info("[rvs] XTTS-Audio legacy empfangen: %dKB", len(audio_b64) // 1365)
|
||||
await self._send_to_rvs({
|
||||
"type": "audio",
|
||||
"payload": {
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ const ALLOWED_TYPES = new Set([
|
|||
"xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list", "voice_upload", "xtts_voice_saved",
|
||||
"update_check", "update_available", "update_download", "update_data",
|
||||
"agent_activity", "cancel_request",
|
||||
"audio_pcm",
|
||||
]);
|
||||
|
||||
// Token-Raum: token -> { clients: Set<ws> }
|
||||
|
|
|
|||
168
xtts/bridge.js
168
xtts/bridge.js
|
|
@ -94,34 +94,33 @@ function connectRVS(forcePlain) {
|
|||
// ── TTS Request Handler ─────────────────────────────
|
||||
|
||||
async function handleTTSRequest(payload) {
|
||||
const { text, voice, requestId, language } = payload;
|
||||
const { text, voice, requestId, language, messageId } = payload;
|
||||
if (!text) return;
|
||||
|
||||
// Markdown + Sonderzeichen entfernen fuer natuerliche Sprache
|
||||
// Markdown-Cleanup (Bridge macht jetzt auch Cleanup, aber safety net)
|
||||
let cleanText = text
|
||||
.replace(/\*\*([^*]+)\*\*/g, "$1") // **fett** → fett
|
||||
.replace(/\*([^*]+)\*/g, "$1") // *kursiv* → kursiv
|
||||
.replace(/`([^`]+)`/g, "$1") // `code` → code
|
||||
.replace(/```[\s\S]*?```/g, "") // Code-Bloecke entfernen
|
||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
|
||||
.replace(/#{1,6}\s*/g, "") // ### Ueberschriften → entfernen
|
||||
.replace(/>\s*/g, "") // > Zitate → entfernen
|
||||
.replace(/[-*]\s+/g, "") // - Listen → entfernen
|
||||
.replace(/\n{2,}/g, ". ") // Mehrere Newlines → Punkt
|
||||
.replace(/\n/g, ", ") // Einzelne Newlines → Komma
|
||||
.replace(/\s{2,}/g, " ") // Mehrfach-Leerzeichen
|
||||
.replace(/["""„]/g, "") // Anfuehrungszeichen entfernen
|
||||
.replace(/\(\)/g, "") // Leere Klammern
|
||||
.replace(/\*\*([^*]+)\*\*/g, "$1")
|
||||
.replace(/\*([^*]+)\*/g, "$1")
|
||||
.replace(/`([^`]+)`/g, "$1")
|
||||
.replace(/```[\s\S]*?```/g, "")
|
||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||
.replace(/#{1,6}\s*/g, "")
|
||||
.replace(/>\s*/g, "")
|
||||
.replace(/[-*]\s+/g, "")
|
||||
.replace(/\n{2,}/g, ". ")
|
||||
.replace(/\n/g, ", ")
|
||||
.replace(/\s{2,}/g, " ")
|
||||
.replace(/["""„]/g, "")
|
||||
.replace(/\(\)/g, "")
|
||||
.trim();
|
||||
|
||||
// Text in Saetze aufteilen, dann zu Chunks von 2-3 Saetzen zusammenfassen
|
||||
// (mehr Kontext = konsistentere Stimme/Lautstaerke, aber nicht zu lang fuer WebSocket)
|
||||
// Satzweise Chunks (XTTS Modell laedt Context pro Call — Saetze gruppieren)
|
||||
const sentences = cleanText.split(/(?<=[.!?])\s+/)
|
||||
.map(s => s.trim())
|
||||
.filter(s => s.length > 0)
|
||||
.map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen
|
||||
.map(s => s.replace(/[.]+$/, ''));
|
||||
|
||||
const MAX_CHUNK_CHARS = 150; // Max ~150 Zeichen pro Chunk (schnelles Rendering, Preloading reicht)
|
||||
const MAX_CHUNK_CHARS = 150;
|
||||
const chunks = [];
|
||||
let currentChunk = '';
|
||||
for (const sentence of sentences) {
|
||||
|
|
@ -135,45 +134,70 @@ async function handleTTSRequest(payload) {
|
|||
if (currentChunk) chunks.push(currentChunk);
|
||||
if (chunks.length === 0) return;
|
||||
|
||||
log(`TTS-Request: "${cleanText.slice(0, 60)}..." (${sentences.length} Saetze → ${chunks.length} Chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
|
||||
log(`TTS-Request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} Chunks, voice: ${voice || "default"})`);
|
||||
|
||||
try {
|
||||
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
|
||||
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
|
||||
|
||||
// Streaming: Chunk rendern → sofort senden → naechster Chunk
|
||||
// App spielt mit Preloading-Queue nahtlos ab
|
||||
let sentCount = 0;
|
||||
let chunkIndex = 0;
|
||||
// Audio-Format (aus WAV-Header extrahiert, einmal pro Request)
|
||||
let pcmMeta = null;
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
const isLastChunk = i === chunks.length - 1;
|
||||
try {
|
||||
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
|
||||
|
||||
if (audioBuffer && audioBuffer.length > 100) {
|
||||
log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
|
||||
// Streaming: PCM-Frames werden nacheinander an RVS gepusht,
|
||||
// sobald sie vom XTTS-Server reinkommen
|
||||
await streamXTTSAsPCM(
|
||||
chunk,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Nach letztem Text-Chunk: final-Flag senden damit App weiss "fertig"
|
||||
if (isLastChunk && pcmMeta) {
|
||||
sendToRVS({
|
||||
type: "xtts_response",
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: `${requestId || ""}_${i}`,
|
||||
base64: audioBuffer.toString("base64"),
|
||||
mimeType: "audio/wav",
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: "",
|
||||
format: "pcm_s16le",
|
||||
sampleRate: pcmMeta.sampleRate,
|
||||
channels: pcmMeta.channels,
|
||||
voice: voice || "default",
|
||||
engine: "xtts",
|
||||
part: i + 1,
|
||||
totalParts: chunks.length,
|
||||
chunk: chunkIndex++,
|
||||
final: true,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
sentCount++;
|
||||
}
|
||||
} catch (chunkErr) {
|
||||
log(`TTS [${i + 1}/${chunks.length}] Fehler: ${chunkErr.message} — ueberspringe`);
|
||||
}
|
||||
}
|
||||
|
||||
log(`TTS komplett: ${sentCount}/${chunks.length} Chunks gestreamt`);
|
||||
log(`TTS komplett: ${chunkIndex} PCM-Frames gestreamt (${cleanText.length} chars)`);
|
||||
} catch (err) {
|
||||
log(`TTS Fehler: ${err.message}`);
|
||||
sendToRVS({
|
||||
|
|
@ -184,7 +208,19 @@ async function handleTTSRequest(payload) {
|
|||
}
|
||||
}
|
||||
|
||||
function callXTTSAPI(text, language, speakerWav) {
|
||||
/**
|
||||
* Ruft /tts_to_audio/ auf und streamt das resultierende WAV bereits waehrend
|
||||
* des Empfangs in PCM-Frames an den Callback. Der WAV-Header wird einmal
|
||||
* geparst, danach werden nur noch raw PCM-Samples weitergeleitet.
|
||||
*
|
||||
* Warum nicht echtes /tts_stream/? daswer123 hat den Endpoint, aber die
|
||||
* Audio-Quality ist dort niedriger und er produziert beim ersten Chunk
|
||||
* oft Artefakte. Pragmatischer Weg: /tts_to_audio/ + Response-Stream
|
||||
* chunkweise auslesen. Das ist zwar kein echtes Server-Streaming, aber
|
||||
* gibt uns deutlich kleinere Netzwerk-Haeppchen und die App kann via
|
||||
* AudioTrack MODE_STREAM sofort nahtlos abspielen.
|
||||
*/
|
||||
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const body = JSON.stringify({
|
||||
text,
|
||||
|
|
@ -206,15 +242,59 @@ function callXTTSAPI(text, language, speakerWav) {
|
|||
};
|
||||
|
||||
const req = http.request(options, (res) => {
|
||||
const chunks = [];
|
||||
res.on("data", (chunk) => chunks.push(chunk));
|
||||
res.on("end", () => {
|
||||
if (res.statusCode === 200) {
|
||||
resolve(Buffer.concat(chunks));
|
||||
} else {
|
||||
reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(chunks).toString().slice(0, 200)}`));
|
||||
if (res.statusCode !== 200) {
|
||||
let body = "";
|
||||
res.on("data", (d) => { body += d.toString(); });
|
||||
res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)));
|
||||
return;
|
||||
}
|
||||
|
||||
let headerParsed = false;
|
||||
let sampleRate = 24000;
|
||||
let channels = 1;
|
||||
let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk
|
||||
const HEADER_BYTES = 44;
|
||||
let headerBuf = Buffer.alloc(0);
|
||||
const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono
|
||||
|
||||
res.on("data", (chunk) => {
|
||||
let data = chunk;
|
||||
|
||||
// WAV-Header konsumieren (44 Bytes)
|
||||
if (!headerParsed) {
|
||||
headerBuf = Buffer.concat([headerBuf, data]);
|
||||
if (headerBuf.length < HEADER_BYTES) return;
|
||||
// Header lesen
|
||||
const header = headerBuf.slice(0, HEADER_BYTES);
|
||||
try {
|
||||
channels = header.readUInt16LE(22);
|
||||
sampleRate = header.readUInt32LE(24);
|
||||
} catch (_) {}
|
||||
headerParsed = true;
|
||||
data = headerBuf.slice(HEADER_BYTES);
|
||||
}
|
||||
|
||||
// leftover aus vorherigem Chunk + neuer data
|
||||
let combined = Buffer.concat([leftover, data]);
|
||||
|
||||
// In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits)
|
||||
while (combined.length >= PCM_CHUNK_BYTES) {
|
||||
const slice = combined.slice(0, PCM_CHUNK_BYTES);
|
||||
combined = combined.slice(PCM_CHUNK_BYTES);
|
||||
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
leftover = combined;
|
||||
});
|
||||
|
||||
res.on("end", () => {
|
||||
// Rest-Daten senden
|
||||
if (leftover.length > 0) {
|
||||
onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
resolve();
|
||||
});
|
||||
|
||||
res.on("error", reject);
|
||||
});
|
||||
|
||||
req.on("error", reject);
|
||||
|
|
|
|||
Loading…
Reference in New Issue