diff --git a/android/android/app/src/main/java/com/ariacockpit/MainApplication.kt b/android/android/app/src/main/java/com/ariacockpit/MainApplication.kt
index 0fd44b2..b2d0c47 100644
--- a/android/android/app/src/main/java/com/ariacockpit/MainApplication.kt
+++ b/android/android/app/src/main/java/com/ariacockpit/MainApplication.kt
@@ -20,6 +20,7 @@ class MainApplication : Application(), ReactApplication {
         PackageList(this).packages.apply {
           add(ApkInstallerPackage())
           add(AudioFocusPackage())
+          add(PcmStreamPlayerPackage())
         }
 
       override fun getJSMainModuleName(): String = "index"
diff --git a/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerModule.kt b/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerModule.kt
new file mode 100644
index 0000000..ab8d8b1
--- /dev/null
+++ b/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerModule.kt
@@ -0,0 +1,158 @@
+package com.ariacockpit
+
+import android.media.AudioAttributes
+import android.media.AudioFormat
+import android.media.AudioManager
+import android.media.AudioTrack
+import android.util.Base64
+import android.util.Log
+import com.facebook.react.bridge.Promise
+import com.facebook.react.bridge.ReactApplicationContext
+import com.facebook.react.bridge.ReactContextBaseJavaModule
+import com.facebook.react.bridge.ReactMethod
+import java.util.concurrent.LinkedBlockingQueue
+
+/**
+ * Streams PCM s16le audio directly via AudioTrack MODE_STREAM.
+ *
+ * Flow:
+ *   JS: start(sampleRate, channels) → opens the AudioTrack and starts the writer thread
+ *   JS: writeChunk(base64)          → decodes and queues; the writer drains the queue
+ *   JS: end()                       → waits until the queue is empty, then closes the AudioTrack
+ *   JS: stop()                      → hard stop, queue is discarded (cancel)
+ *
+ * Advantages over the sound-file queue:
+ *   - No gap between chunks (AudioTrack buffers internally)
+ *   - The first samples start playing as soon as the first chunk arrives
+ *   - No WAV header parsing per chunk
+ */
+class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
+    companion object {
+        private const val TAG = "PcmStreamPlayer"
+    }
+
+    override fun getName() = "PcmStreamPlayer"
+
+    private var track: AudioTrack? = null
+    private val queue = LinkedBlockingQueue<ByteArray>()
+    private var writerThread: Thread? = null
+    @Volatile private var writerShouldStop = false
+    @Volatile private var endRequested = false
+
+    // ── Lifecycle ──
+
+    @ReactMethod
+    fun start(sampleRate: Int, channels: Int, promise: Promise) {
+        try {
+            // End any previous session first
+            stopInternal()
+
+            val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
+            val encoding = AudioFormat.ENCODING_PCM_16BIT
+            val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
+            // Generous buffer: 8x the minimum size (roughly 200-400ms at 24kHz) — stays smooth even through small network hiccups
+            val bufferSize = (minBuf * 8).coerceAtLeast(32 * 1024)
+
+            val newTrack = AudioTrack.Builder()
+                .setAudioAttributes(
+                    AudioAttributes.Builder()
+                        .setUsage(AudioAttributes.USAGE_ASSISTANT)
+                        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+                        .build(),
+                )
+                .setAudioFormat(
+                    AudioFormat.Builder()
+                        .setSampleRate(sampleRate)
+                        .setChannelMask(channelConfig)
+                        .setEncoding(encoding)
+                        .build(),
+                )
+                .setBufferSizeInBytes(bufferSize)
+                .setTransferMode(AudioTrack.MODE_STREAM)
+                .build()
+
+            newTrack.play()
+            track = newTrack
+            queue.clear()
+            writerShouldStop = false
+            endRequested = false
+
+            writerThread = Thread({
+                val t = track ?: return@Thread
+                try {
+                    while (!writerShouldStop) {
+                        val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
+                            if (endRequested) return@Thread
+                            null
+                        } ?: continue
+                        var offset = 0
+                        while (offset < data.size && !writerShouldStop) {
+                            val written = t.write(data, offset, data.size - offset)
+                            if (written <= 0) break
+                            offset += written
+                        }
+                    }
+                } catch (e: Exception) {
+                    Log.w(TAG, "Writer thread error: ${e.message}")
+                } finally {
+                    try { t.stop() } catch (_: Exception) {}
+                    try { t.release() } catch (_: Exception) {}
+                }
+            }, "PcmStreamWriter").apply { start() }
+
+            Log.i(TAG, "Stream started: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
+            promise.resolve(true)
+        } catch (e: Exception) {
+            Log.e(TAG, "start failed", e)
+            promise.reject("START_FAILED", e.message, e)
+        }
+    }
+
+    @ReactMethod
+    fun writeChunk(base64Pcm: String, promise: Promise) {
+        try {
+            if (base64Pcm.isEmpty()) {
+                promise.resolve(true)
+                return
+            }
+            val bytes = Base64.decode(base64Pcm, Base64.DEFAULT)
+            queue.put(bytes)
+            promise.resolve(true)
+        } catch (e: Exception) {
+            promise.reject("WRITE_FAILED", e.message, e)
+        }
+    }
+
+    /** Signals: no further chunks. The writer waits for the queue to drain, then stops. */
+    @ReactMethod
+    fun end(promise: Promise) {
+        endRequested = true
+        promise.resolve(true)
+    }
+
+    /** Hard stop (cancel) — discard the queue. */
+    @ReactMethod
+    fun stop(promise: Promise) {
+        stopInternal()
+        promise.resolve(true)
+    }
+
+    private fun stopInternal() {
+        writerShouldStop = true
+        endRequested = true
+        queue.clear()
+        writerThread?.interrupt()
+        writerThread = null
+        val t = track
+        if (t != null) {
+            try { t.stop() } catch (_: Exception) {}
+            try { t.release() } catch (_: Exception) {}
+        }
+        track = null
+    }
+
+    override fun onCatalystInstanceDestroy() {
+        stopInternal()
+        super.onCatalystInstanceDestroy()
+    }
+}
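For reference, a minimal sketch of how this module is meant to be driven from the JS side. It only assumes the four `@ReactMethod`s above; in this PR the actual calls live in `audio.ts` (`handlePcmChunk`), and `playPcmChunks` here is a hypothetical standalone helper:

```typescript
import { NativeModules } from 'react-native';

// Hypothetical standalone usage of the native module defined above.
const { PcmStreamPlayer } = NativeModules;

async function playPcmChunks(chunks: string[], sampleRate = 24000, channels = 1): Promise<void> {
  if (!PcmStreamPlayer) return;                        // module not linked (e.g. non-Android build)
  await PcmStreamPlayer.start(sampleRate, channels);   // opens the AudioTrack, starts the writer thread
  for (const base64Pcm of chunks) {
    await PcmStreamPlayer.writeChunk(base64Pcm);       // decoded and queued on the native side
  }
  await PcmStreamPlayer.end();                         // drain the queue, then release the AudioTrack
  // PcmStreamPlayer.stop() would instead cancel immediately and discard queued audio.
}
```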
diff --git a/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerPackage.kt b/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerPackage.kt
new file mode 100644
index 0000000..3d53caa
--- /dev/null
+++ b/android/android/app/src/main/java/com/ariacockpit/PcmStreamPlayerPackage.kt
@@ -0,0 +1,16 @@
+package com.ariacockpit
+
+import com.facebook.react.ReactPackage
+import com.facebook.react.bridge.NativeModule
+import com.facebook.react.bridge.ReactApplicationContext
+import com.facebook.react.uimanager.ViewManager
+
+class PcmStreamPlayerPackage : ReactPackage {
+    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
+        return listOf(PcmStreamPlayerModule(reactContext))
+    }
+
+    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
+        return emptyList()
+    }
+}
diff --git a/android/src/screens/ChatScreen.tsx b/android/src/screens/ChatScreen.tsx
index 0fb589a..732ec48 100644
--- a/android/src/screens/ChatScreen.tsx
+++ b/android/src/screens/ChatScreen.tsx
@@ -274,6 +274,20 @@ const ChatScreen: React.FC = () => {
       }
     }
 
+    // XTTS PCM stream: straight to AudioTrack; on the final chunk, write the WAV cache
+    if (message.type === ('audio_pcm' as any)) {
+      const p = message.payload as any;
+      const refId = (p.messageId as string) || '';
+      audioService.handlePcmChunk(p).then((audioPath: any) => {
+        // If final and a cache path came back, update the message
+        if (p.final && audioPath && refId) {
+          setMessages(prev => prev.map(m =>
+            m.messageId === refId ? { ...m, audioPath } : m
+          ));
+        }
+      }).catch(() => {});
+    }
+
     // Thinking indicator status from the bridge
     if (message.type === 'agent_activity') {
       const activity = (message.payload.activity as string) || 'idle';
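The handler above treats the payload as `any`. A rough shape of the `audio_pcm` websocket message it consumes, derived from the fields built in `aria_bridge.py` and `xtts/bridge.js` further down in this diff — the interface itself is illustrative and not part of the PR:

```typescript
// Illustrative typing only — the app currently reads these fields off an `any` payload.
interface AudioPcmPayload {
  requestId: string;      // XTTS request id; the bridge uses it to resolve the messageId
  messageId: string;      // chat message this audio belongs to
  base64: string;         // raw PCM s16le, base64-encoded; empty on the final marker frame
  format: 'pcm_s16le';
  sampleRate: number;     // e.g. 24000
  channels: number;       // e.g. 1
  voice: string;
  chunk: number;          // running frame index per request
  final: boolean;         // true exactly once, after the last PCM frame
}

interface AudioPcmMessage {
  type: 'audio_pcm';
  payload: AudioPcmPayload;
  timestamp: number;
}
```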
diff --git a/android/src/services/audio.ts b/android/src/services/audio.ts
index 32bea10..2a2814f 100644
--- a/android/src/services/audio.ts
+++ b/android/src/services/audio.ts
@@ -16,13 +16,36 @@ import AudioRecorderPlayer, {
   OutputFormatAndroidType,
 } from 'react-native-audio-recorder-player';
 
+// Base64 encoder for binary strings (header bytes → base64)
+const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
+function btoaSafe(bin: string): string {
+  let out = '';
+  const len = bin.length;
+  for (let i = 0; i < len; i += 3) {
+    const b1 = bin.charCodeAt(i) & 0xff;
+    const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
+    const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
+    out += B64_CHARS[b1 >> 2];
+    out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
+    out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
+    out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
+  }
+  return out;
+}
+
 // Native modules for audio focus (ducking/muting other apps)
-const { AudioFocus } = NativeModules as {
+const { AudioFocus, PcmStreamPlayer } = NativeModules as {
   AudioFocus?: {
     requestDuck: () => Promise<void>;
     requestExclusive: () => Promise<void>;
     release: () => Promise<void>;
   };
+  PcmStreamPlayer?: {
+    start: (sampleRate: number, channels: number) => Promise<boolean>;
+    writeChunk: (base64Pcm: string) => Promise<boolean>;
+    end: () => Promise<boolean>;
+    stop: () => Promise<boolean>;
+  };
 };
 
 // --- Types ---
@@ -79,6 +102,15 @@ class AudioService {
   private speechDetected: boolean = false;
   private speechStartTime: number = 0;
 
+  // PCM stream (XTTS): active session + cache buffer per messageId
+  private pcmStreamActive: boolean = false;
+  private pcmMessageId: string = '';
+  private pcmSampleRate: number = 24000;
+  private pcmChannels: number = 1;
+  private pcmBuffer: string[] = []; // base64 chunks for the later WAV build
+  private pcmBytesCollected: number = 0;
+  private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB
+
   // VAD State
   private vadEnabled: boolean = false;
   private lastSpeechTime: number = 0;
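As a sanity check on the 30MB cache cap, some worked numbers (not part of the PR, assuming the default 24kHz mono s16le format used throughout):

```typescript
// Rough capacity of the PCM replay cache under the defaults above.
const sampleRate = 24000;                                       // Hz
const bytesPerSecond = sampleRate * 1 /* channel */ * 2;        // s16le → 48,000 B/s
const capBytes = 30 * 1024 * 1024;                              // PCM_MAX_CACHE_BYTES
const maxSeconds = capBytes / bytesPerSecond;                   // ≈ 655 s ≈ 11 minutes of speech
// The per-chunk estimate below (base64.length * 0.75) slightly overcounts padded chunks,
// which only makes the cap marginally more conservative.
console.log(`cache holds ~${Math.round(maxSeconds / 60)} min of audio`);
```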
@@ -303,6 +335,141 @@
     }
   }
 
+  /** Receive a PCM chunk from an audio_pcm message and play/cache it.
+   * On final=true returns the cache path (file://), or '' if nothing was cached. */
+  async handlePcmChunk(payload: {
+    base64: string;
+    sampleRate?: number;
+    channels?: number;
+    messageId?: string;
+    chunk?: number;
+    final?: boolean;
+  }): Promise<string> {
+    if (!PcmStreamPlayer) {
+      console.warn('[Audio] PcmStreamPlayer native module not available');
+      return '';
+    }
+
+    const messageId = payload.messageId || '';
+    const sampleRate = payload.sampleRate || 24000;
+    const channels = payload.channels || 1;
+    const base64 = payload.base64 || '';
+    const isFinal = !!payload.final;
+
+    // New stream? (messageId changed or no stream active)
+    if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
+      // Cleanly end the previous stream (if any)
+      if (this.pcmStreamActive) {
+        try { await PcmStreamPlayer.stop(); } catch {}
+        // Discard the old buffer (it never got its final — a new message interrupted it)
+        this.pcmBuffer = [];
+        this.pcmBytesCollected = 0;
+      }
+      this.pcmStreamActive = true;
+      this.pcmMessageId = messageId;
+      this.pcmSampleRate = sampleRate;
+      this.pcmChannels = channels;
+      this.pcmBuffer = [];
+      this.pcmBytesCollected = 0;
+      try {
+        await PcmStreamPlayer.start(sampleRate, channels);
+      } catch (err) {
+        console.error('[Audio] PcmStreamPlayer.start failed:', err);
+        this.pcmStreamActive = false;
+        return '';
+      }
+      // Audio focus: duck other apps
+      AudioFocus?.requestDuck().catch(() => {});
+    }
+
+    // Play + cache the chunk
+    if (base64) {
+      try { await PcmStreamPlayer.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
+      // Collect into the cache buffer (while it is not too large yet)
+      if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
+        this.pcmBuffer.push(base64);
+        // 4 base64 chars ≈ 3 bytes — rough estimate
+        this.pcmBytesCollected += Math.floor(base64.length * 0.75);
+      }
+    }
+
+    if (isFinal) {
+      // End the stream cleanly (keeps playing until the buffer is empty)
+      try { await PcmStreamPlayer.end(); } catch {}
+      this.pcmStreamActive = false;
+      AudioFocus?.release().catch(() => {});
+
+      // Build a WAV file for replay from the collected PCM chunks
+      if (messageId && this.pcmBuffer.length > 0) {
+        const audioPath = await this._savePcmBufferAsWav(messageId);
+        this.pcmBuffer = [];
+        this.pcmBytesCollected = 0;
+        this.pcmMessageId = '';
+        return audioPath;
+      }
+      this.pcmMessageId = '';
+    }
+    return '';
+  }
+
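How the cached file is replayed later is outside this diff; presumably the `audioPath` stored on the message by `ChatScreen` ends up in the existing `playFromPath` (shown as context below). A purely illustrative sketch — `onReplayPressed` is a hypothetical handler, not code from the PR:

```typescript
// Illustrative only: replaying a cached reply via the existing file-based player.
async function onReplayPressed(message: { audioPath?: string }): Promise<void> {
  if (message.audioPath) {
    await audioService.playFromPath(message.audioPath); // plays the cached WAV written on `final`
  }
}
```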
+  /** Save the collected PCM chunks as a WAV file. Returns a file:// path. */
+  private async _savePcmBufferAsWav(messageId: string): Promise<string> {
+    try {
+      const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
+      await RNFS.mkdir(dir).catch(() => {});
+      const path = `${dir}/${messageId}.wav`;
+
+      // WAV header for PCM s16le
+      const sampleRate = this.pcmSampleRate;
+      const channels = this.pcmChannels;
+      const bitsPerSample = 16;
+      const byteRate = sampleRate * channels * bitsPerSample / 8;
+      const blockAlign = channels * bitsPerSample / 8;
+      const dataSize = this.pcmBytesCollected;
+      const fileSize = 36 + dataSize;
+
+      // Header as base64 (44 bytes)
+      const header = new Uint8Array(44);
+      const dv = new DataView(header.buffer);
+      // "RIFF"
+      header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46;
+      dv.setUint32(4, fileSize, true);
+      // "WAVE"
+      header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45;
+      // "fmt "
+      header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20;
+      dv.setUint32(16, 16, true);  // fmt chunk size
+      dv.setUint16(20, 1, true);   // PCM format
+      dv.setUint16(22, channels, true);
+      dv.setUint32(24, sampleRate, true);
+      dv.setUint32(28, byteRate, true);
+      dv.setUint16(32, blockAlign, true);
+      dv.setUint16(34, bitsPerSample, true);
+      // "data"
+      header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61;
+      dv.setUint32(40, dataSize, true);
+
+      // Header as base64
+      let headerB64 = '';
+      const chunk = 1024;
+      for (let i = 0; i < header.length; i += chunk) {
+        headerB64 += String.fromCharCode(...Array.from(header.slice(i, i + chunk)));
+      }
+      headerB64 = btoaSafe(headerB64);
+
+      // Write the file: header + all PCM chunks
+      await RNFS.writeFile(path, headerB64, 'base64');
+      for (const b64 of this.pcmBuffer) {
+        await RNFS.appendFile(path, b64, 'base64');
+      }
+      console.log(`[Audio] PCM cache written: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
+      return `file://${path}`;
+    } catch (err) {
+      console.warn('[Audio] _savePcmBufferAsWav failed:', err);
+      return '';
+    }
+  }
+
   /** Queue audio from a local file (file:// path) and play it. */
   async playFromPath(filePath: string): Promise<void> {
     if (!filePath) return;
@@ -419,6 +586,14 @@
       if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
       this.preloadedPath = '';
     }
+    // Also hard-stop the PCM stream (cancel/abort)
+    if (this.pcmStreamActive) {
+      PcmStreamPlayer?.stop().catch(() => {});
+      this.pcmStreamActive = false;
+      this.pcmBuffer = [];
+      this.pcmBytesCollected = 0;
+      this.pcmMessageId = '';
+    }
     // Release audio focus
     AudioFocus?.release().catch(() => {});
   }
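For reference, the 44-byte header written above can be read back at the same fixed offsets. This round-trip check is illustrative only and assumes the canonical layout `_savePcmBufferAsWav` produces (`fmt ` at offset 12, `data` at 36) — the same offsets the xtts bridge below relies on when it strips the header:

```typescript
// Illustrative check of the canonical 44-byte PCM WAV header layout used above.
function parseCanonicalWavHeader(header: Uint8Array) {
  const dv = new DataView(header.buffer, header.byteOffset, header.byteLength);
  const tag = (o: number) =>
    String.fromCharCode(header[o], header[o + 1], header[o + 2], header[o + 3]);
  if (tag(0) !== 'RIFF' || tag(8) !== 'WAVE' || tag(12) !== 'fmt ' || tag(36) !== 'data') {
    throw new Error('not a canonical 44-byte PCM WAV header');
  }
  return {
    audioFormat: dv.getUint16(20, true),    // 1 = PCM
    channels: dv.getUint16(22, true),
    sampleRate: dv.getUint32(24, true),
    byteRate: dv.getUint32(28, true),       // sampleRate * channels * 2
    blockAlign: dv.getUint16(32, true),
    bitsPerSample: dv.getUint16(34, true),  // 16
    dataSize: dv.getUint32(40, true),
  };
}
```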
diff --git a/bridge/aria_bridge.py b/bridge/aria_bridge.py
index 9589f09..2903cce 100644
--- a/bridge/aria_bridge.py
+++ b/bridge/aria_bridge.py
@@ -1296,19 +1296,41 @@ class ARIABridge:
                 await self._emit_activity("idle", "")
             return
 
+        elif msg_type == "audio_pcm":
+            # XTTS PCM stream received from the gaming PC → pass it through to the app.
+            # If the payload has no messageId (old XTTS bridge), resolve it from the requestId.
+            error = payload.get("error", "")
+            if error:
+                logger.warning("[rvs] XTTS PCM error: %s", error)
+                return
+            linked_message_id = payload.get("messageId", "")
+            if not linked_message_id:
+                req_id_full = payload.get("requestId", "")
+                req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
+                linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
+            # Forward 1:1, only with the messageId filled in
+            forwarded = dict(payload)
+            forwarded["messageId"] = linked_message_id
+            await self._send_to_rvs({
+                "type": "audio_pcm",
+                "payload": forwarded,
+                "timestamp": int(asyncio.get_event_loop().time() * 1000),
+            })
+            return
+
         elif msg_type == "xtts_response":
-            # XTTS audio received from the gaming PC → forward to the app
+            # Legacy path (old XTTS bridge with WAV response). Forwarded as
+            # type "audio" — the app uses the existing WAV queue player.
             audio_b64 = payload.get("base64", "")
             error = payload.get("error", "")
             req_id_full = payload.get("requestId", "")
-            # The XTTS bridge suffixes per chunk: "uuid_0", "uuid_1" → extract the base UUID
             req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
             linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
             if error:
                 logger.warning("[rvs] XTTS error: %s", error)
                 return
             if audio_b64:
-                logger.info("[rvs] XTTS audio received: %dKB", len(audio_b64) // 1365)
+                logger.info("[rvs] XTTS audio (legacy) received: %dKB", len(audio_b64) // 1365)
                 await self._send_to_rvs({
                     "type": "audio",
                     "payload": {
diff --git a/rvs/server.js b/rvs/server.js
index 15a73d0..e381b57 100644
--- a/rvs/server.js
+++ b/rvs/server.js
@@ -17,6 +17,7 @@ const ALLOWED_TYPES = new Set([
   "xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list",
   "voice_upload", "xtts_voice_saved", "update_check", "update_available",
   "update_download", "update_data", "agent_activity", "cancel_request",
+  "audio_pcm",
 ]);
 
 // Token room: token -> { clients: Set }
diff --git a/xtts/bridge.js b/xtts/bridge.js
index 54859bf..7c2a0db 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -94,34 +94,33 @@ function connectRVS(forcePlain) {
 
 // ── TTS Request Handler ─────────────────────────────
 async function handleTTSRequest(payload) {
-  const { text, voice, requestId, language } = payload;
+  const { text, voice, requestId, language, messageId } = payload;
   if (!text) return;
 
-  // Strip markdown + special characters for natural speech
+  // Markdown cleanup (the bridge now cleans up too, but keep this as a safety net)
   let cleanText = text
-    .replace(/\*\*([^*]+)\*\*/g, "$1")       // **bold** → bold
-    .replace(/\*([^*]+)\*/g, "$1")           // *italic* → italic
-    .replace(/`([^`]+)`/g, "$1")             // `code` → code
-    .replace(/```[\s\S]*?```/g, "")          // strip code blocks
-    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
-    .replace(/#{1,6}\s*/g, "")               // ### headings → strip
-    .replace(/>\s*/g, "")                    // > quotes → strip
-    .replace(/[-*]\s+/g, "")                 // - list markers → strip
-    .replace(/\n{2,}/g, ". ")                // multiple newlines → period
-    .replace(/\n/g, ", ")                    // single newlines → comma
-    .replace(/\s{2,}/g, " ")                 // collapse repeated spaces
-    .replace(/["""„]/g, "")                  // strip quotation marks
-    .replace(/\(\)/g, "")                    // empty parentheses
+    .replace(/\*\*([^*]+)\*\*/g, "$1")
+    .replace(/\*([^*]+)\*/g, "$1")
+    .replace(/`([^`]+)`/g, "$1")
+    .replace(/```[\s\S]*?```/g, "")
+    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
+    .replace(/#{1,6}\s*/g, "")
+    .replace(/>\s*/g, "")
+    .replace(/[-*]\s+/g, "")
+    .replace(/\n{2,}/g, ". ")
+    .replace(/\n/g, ", ")
+    .replace(/\s{2,}/g, " ")
+    .replace(/["""„]/g, "")
+    .replace(/\(\)/g, "")
     .trim();
 
-  // Split the text into sentences, then group them into chunks of 2-3 sentences
-  // (more context = more consistent voice/volume, but not too long for the WebSocket)
+  // Sentence-wise chunks (the XTTS model loads context per call — group sentences)
  const sentences = cleanText.split(/(?<=[.!?])\s+/)
    .map(s => s.trim())
    .filter(s => s.length > 0)
-    .map(s => s.replace(/[.]+$/, '')); // strip trailing period
+    .map(s => s.replace(/[.]+$/, ''));
 
-  const MAX_CHUNK_CHARS = 150; // max ~150 chars per chunk (fast rendering; preloading is enough)
+  const MAX_CHUNK_CHARS = 150;
   const chunks = [];
   let currentChunk = '';
   for (const sentence of sentences) {
@@ -135,45 +134,70 @@
   if (currentChunk) chunks.push(currentChunk);
   if (chunks.length === 0) return;
 
-  log(`TTS request: "${cleanText.slice(0, 60)}..." (${sentences.length} sentences → ${chunks.length} chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
+  log(`TTS request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} chunks, voice: ${voice || "default"})`);
 
   try {
     const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
     const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
 
-    // Streaming: render a chunk → send immediately → next chunk
-    // The app plays back seamlessly via its preloading queue
-    let sentCount = 0;
+    let chunkIndex = 0;
+    // Audio format (extracted from the WAV header, once per request)
+    let pcmMeta = null;
     for (let i = 0; i < chunks.length; i++) {
       const chunk = chunks[i];
+      const isLastChunk = i === chunks.length - 1;
       try {
-        const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
-
-        if (audioBuffer && audioBuffer.length > 100) {
-          log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
+        // Streaming: PCM frames are pushed to RVS one after another,
+        // as soon as they arrive from the XTTS server
+        await streamXTTSAsPCM(
+          chunk,
+          language || "de",
+          hasCustomVoice ? voiceSample : null,
+          (pcmBase64, meta) => {
+            if (!pcmMeta) pcmMeta = meta;
+            sendToRVS({
+              type: "audio_pcm",
+              payload: {
+                requestId: requestId || "",
+                messageId: messageId || "",
+                base64: pcmBase64,
+                format: "pcm_s16le",
+                sampleRate: meta.sampleRate,
+                channels: meta.channels,
+                voice: voice || "default",
+                chunk: chunkIndex++,
+                final: false,
+              },
+              timestamp: Date.now(),
+            });
+          },
+        );
+        // After the last text chunk: send the final flag so the app knows it is done
+        if (isLastChunk && pcmMeta) {
          sendToRVS({
-            type: "xtts_response",
+            type: "audio_pcm",
            payload: {
-              requestId: `${requestId || ""}_${i}`,
-              base64: audioBuffer.toString("base64"),
-              mimeType: "audio/wav",
+              requestId: requestId || "",
+              messageId: messageId || "",
+              base64: "",
+              format: "pcm_s16le",
+              sampleRate: pcmMeta.sampleRate,
+              channels: pcmMeta.channels,
              voice: voice || "default",
-              engine: "xtts",
-              part: i + 1,
-              totalParts: chunks.length,
+              chunk: chunkIndex++,
+              final: true,
            },
            timestamp: Date.now(),
          });
-          sentCount++;
        }
      } catch (chunkErr) {
        log(`TTS [${i + 1}/${chunks.length}] error: ${chunkErr.message} — skipping`);
      }
    }
 
-    log(`TTS complete: ${sentCount}/${chunks.length} chunks streamed`);
+    log(`TTS complete: ${chunkIndex} PCM frames streamed (${cleanText.length} chars)`);
 
   } catch (err) {
    log(`TTS error: ${err.message}`);
    sendToRVS({
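The loop that groups sentences into ≤150-character chunks sits between the two hunks above and is only context in this diff, so here is a standalone sketch of the grouping it performs. Variable names mirror the bridge code, but this is an illustration, not the exact source:

```typescript
// Illustrative restatement of the sentence → chunk grouping (the real loop is
// unchanged by this PR and therefore not shown in the diff).
function groupSentences(sentences: string[], maxChars = 150): string[] {
  const chunks: string[] = [];
  let currentChunk = '';
  for (const sentence of sentences) {
    const candidate = currentChunk ? `${currentChunk} ${sentence}` : sentence;
    if (candidate.length > maxChars && currentChunk) {
      chunks.push(currentChunk);   // current group is full → start a new one
      currentChunk = sentence;
    } else {
      currentChunk = candidate;    // keep growing the current group
    }
  }
  if (currentChunk) chunks.push(currentChunk);
  return chunks;
}

// e.g. groupSentences(['First sentence', 'Second sentence'])
//   → ['First sentence Second sentence']
```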
+ */ +function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) { return new Promise((resolve, reject) => { const body = JSON.stringify({ text, @@ -206,15 +242,59 @@ function callXTTSAPI(text, language, speakerWav) { }; const req = http.request(options, (res) => { - const chunks = []; - res.on("data", (chunk) => chunks.push(chunk)); - res.on("end", () => { - if (res.statusCode === 200) { - resolve(Buffer.concat(chunks)); - } else { - reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(chunks).toString().slice(0, 200)}`)); + if (res.statusCode !== 200) { + let body = ""; + res.on("data", (d) => { body += d.toString(); }); + res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`))); + return; + } + + let headerParsed = false; + let sampleRate = 24000; + let channels = 1; + let leftover = Buffer.alloc(0); // ungerade Byte-Reste fuer das naechste Chunk + const HEADER_BYTES = 44; + let headerBuf = Buffer.alloc(0); + const PCM_CHUNK_BYTES = 8192; // ~170ms bei 24kHz s16 mono + + res.on("data", (chunk) => { + let data = chunk; + + // WAV-Header konsumieren (44 Bytes) + if (!headerParsed) { + headerBuf = Buffer.concat([headerBuf, data]); + if (headerBuf.length < HEADER_BYTES) return; + // Header lesen + const header = headerBuf.slice(0, HEADER_BYTES); + try { + channels = header.readUInt16LE(22); + sampleRate = header.readUInt32LE(24); + } catch (_) {} + headerParsed = true; + data = headerBuf.slice(HEADER_BYTES); } + + // leftover aus vorherigem Chunk + neuer data + let combined = Buffer.concat([leftover, data]); + + // In PCM_CHUNK_BYTES-Happen zerlegen (Vielfache von 2 damit keine Sample-Splits) + while (combined.length >= PCM_CHUNK_BYTES) { + const slice = combined.slice(0, PCM_CHUNK_BYTES); + combined = combined.slice(PCM_CHUNK_BYTES); + onPcmChunk(slice.toString("base64"), { sampleRate, channels }); + } + leftover = combined; }); + + res.on("end", () => { + // Rest-Daten senden + if (leftover.length > 0) { + onPcmChunk(leftover.toString("base64"), { sampleRate, channels }); + } + resolve(); + }); + + res.on("error", reject); }); req.on("error", reject);