feat: streaming TTS, raw PCM stream instead of WAV chunks (approach A)

Pipeline: XTTS server → xtts-bridge → aria-bridge → RVS → app AudioTrack

XTTS bridge (gaming PC):
- streamXTTSAsPCM(): reads the /tts_to_audio/ response incrementally,
  parses the WAV header (sample rate/channels), splits the PCM into 8 KB
  chunks (~170 ms at 24 kHz s16 mono, see the math below) and sends each
  one as audio_pcm.
- A final chunk with final=true follows the last text chunk
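
A quick sanity check on the chunk duration (TypeScript sketch; 8192 is
the bridge's PCM_CHUNK_BYTES, format is 16-bit mono at 24 kHz):

    // 2 bytes per sample, 1 channel, 24000 samples per second
    const bytesPerSecond = 24000 * 2 * 1;            // 48000 B/s
    const chunkMs = (8192 / bytesPerSecond) * 1000;  // ~170.7 ms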

aria-bridge:
- The audio_pcm handler forwards the payload 1:1 and fills in messageId
  from the requestId → messageId map when the XTTS bridge did not include
  one (lookup sketched below)
- The old xtts_response path stays as a legacy fallback (WAV)
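
The lookup mirrors the Python handler further down; as a TypeScript
sketch (xttsRequestToMessage is a stand-in name for the bridge's map):

    // "uuid_3" → "uuid": strip the per-chunk suffix, then resolve
    const base = reqId.includes("_") ? reqId.slice(0, reqId.lastIndexOf("_")) : reqId;
    const messageId = payload.messageId || xttsRequestToMessage.get(base) || "";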

RVS: audio_pcm added to ALLOWED_TYPES

Android native:
- PcmStreamPlayerModule (Kotlin): AudioTrack in MODE_STREAM with a
  writer thread and a BlockingQueue. start(rate, ch) / writeChunk(b64)
  / end() / stop(); call sequence sketched below
- Buffer generously sized at 8x the minimum buffer size, so playback
  stays smooth even through network stalls
- Registered in MainApplication via PcmStreamPlayerPackage
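
Expected call sequence from JS (a sketch against the NativeModules
typing added in audioService below; values are illustrative):

    const { PcmStreamPlayer } = NativeModules;
    await PcmStreamPlayer.start(24000, 1);  // open AudioTrack, spawn writer
    await PcmStreamPlayer.writeChunk(b64);  // repeat per audio_pcm frame
    await PcmStreamPlayer.end();            // drain the queue, then release
    // use PcmStreamPlayer.stop() instead of end() for a hard cancel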

App JS:
- audioService.handlePcmChunk(): detects a new session (messageId change),
  starts the native stream, and caches the PCM bytes per message. On
  final=true it closes the stream cleanly and _savePcmBufferAsWav writes
  a WAV file to tts_cache/<messageId>.wav
- _savePcmBufferAsWav: builds a 44-byte WAV header (PCM s16le, correct
  sample rate/channels; layout below) and appends all collected base64
  PCM chunks
- stopPlayback also terminates an active PCM stream
- ChatScreen routes type=audio_pcm to handlePcmChunk and, on final,
  sets audioPath on the message
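
For reference, the 44-byte header _savePcmBufferAsWav emits (canonical
PCM WAV layout, multi-byte fields little-endian):

    offset 0  "RIFF"        offset 4  fileSize = 36 + dataSize
    offset 8  "WAVE"        offset 12 "fmt "
    offset 16 16 (fmt sz)   offset 20 1 (PCM)
    offset 22 channels      offset 24 sampleRate
    offset 28 byteRate      offset 32 blockAlign
    offset 34 16 (bits)     offset 36 "data"
    offset 40 dataSize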

Play button: if a messageId has an audioPath, the cached WAV is played
(Sound-based), no matter whether the original TTS engine was Piper or XTTS.

Audio focus:
- requestDuck() on stream start, release() on stream end (sketch below)
- Other apps (Spotify etc.) get quieter while ARIA speaks
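
The pairing as wired in audioService (sketch; both calls exist in the
AudioFocus typing shown below):

    await AudioFocus?.requestDuck().catch(() => {});  // first chunk of a stream
    // ... PCM plays via AudioTrack ...
    await AudioFocus?.release().catch(() => {});      // final chunk or stopPlayback()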

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
duffyduck 2026-04-19 22:01:27 +02:00
parent eb12281dfc
commit 6ab6196739
8 changed files with 515 additions and 48 deletions

View File

@@ -20,6 +20,7 @@ class MainApplication : Application(), ReactApplication {
PackageList(this).packages.apply {
add(ApkInstallerPackage())
add(AudioFocusPackage())
add(PcmStreamPlayerPackage())
}
override fun getJSMainModuleName(): String = "index"

View File

@@ -0,0 +1,158 @@
package com.ariacockpit
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.util.Base64
import android.util.Log
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import java.util.concurrent.LinkedBlockingQueue
/**
* Streams PCM s16le audio directly via AudioTrack in MODE_STREAM.
*
* Flow:
* JS: start(sampleRate, channels) opens the AudioTrack and starts the writer thread
* JS: writeChunk(base64) decodes and enqueues; the writer drains without blocking JS
* JS: end() waits until the queue is empty, then closes the AudioTrack
* JS: stop() hard stop, clear the queue (cancel)
*
* Advantages over the sound-file queue:
* - No gaps between chunks (AudioTrack buffers internally)
* - The first samples start playing as soon as the first chunk arrives
* - No WAV header parsing per chunk
*/
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
companion object {
private const val TAG = "PcmStreamPlayer"
}
override fun getName() = "PcmStreamPlayer"
private var track: AudioTrack? = null
private val queue = LinkedBlockingQueue<ByteArray>()
private var writerThread: Thread? = null
@Volatile private var writerShouldStop = false
@Volatile private var endRequested = false
// ── Lifecycle ──
@ReactMethod
fun start(sampleRate: Int, channels: Int, promise: Promise) {
try {
// End any previous session first
stopInternal()
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
val encoding = AudioFormat.ENCODING_PCM_16BIT
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
// Generous buffer: 8x the minimum size (roughly 200-400 ms at 24 kHz), smooth even through small network stalls
val bufferSize = (minBuf * 8).coerceAtLeast(32 * 1024)
val newTrack = AudioTrack.Builder()
.setAudioAttributes(
AudioAttributes.Builder()
.setUsage(AudioAttributes.USAGE_ASSISTANT)
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.build(),
)
.setAudioFormat(
AudioFormat.Builder()
.setSampleRate(sampleRate)
.setChannelMask(channelConfig)
.setEncoding(encoding)
.build(),
)
.setBufferSizeInBytes(bufferSize)
.setTransferMode(AudioTrack.MODE_STREAM)
.build()
newTrack.play()
track = newTrack
queue.clear()
writerShouldStop = false
endRequested = false
writerThread = Thread({
val t = track ?: return@Thread
try {
while (!writerShouldStop) {
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS)
if (data == null) {
if (endRequested) return@Thread
continue
}
var offset = 0
while (offset < data.size && !writerShouldStop) {
val written = t.write(data, offset, data.size - offset)
if (written <= 0) break
offset += written
}
}
} catch (e: Exception) {
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
} finally {
try { t.stop() } catch (_: Exception) {}
try { t.release() } catch (_: Exception) {}
}
}, "PcmStreamWriter").apply { start() }
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
promise.resolve(true)
} catch (e: Exception) {
Log.e(TAG, "start fehlgeschlagen", e)
promise.reject("START_FAILED", e.message, e)
}
}
@ReactMethod
fun writeChunk(base64Pcm: String, promise: Promise) {
try {
if (base64Pcm.isEmpty()) {
promise.resolve(true)
return
}
val bytes = Base64.decode(base64Pcm, Base64.DEFAULT)
queue.put(bytes)
promise.resolve(true)
} catch (e: Exception) {
promise.reject("WRITE_FAILED", e.message, e)
}
}
/** Signals that no more chunks will arrive. The writer drains the queue, then stops. */
@ReactMethod
fun end(promise: Promise) {
endRequested = true
promise.resolve(true)
}
/** Hard stop (cancel): discard the queue. */
@ReactMethod
fun stop(promise: Promise) {
stopInternal()
promise.resolve(true)
}
private fun stopInternal() {
writerShouldStop = true
endRequested = true
queue.clear()
writerThread?.interrupt()
// Briefly join the old writer so it cannot race a new session on the shared queue
try { writerThread?.join(250) } catch (_: InterruptedException) {}
writerThread = null
val t = track
if (t != null) {
try { t.stop() } catch (_: Exception) {}
try { t.release() } catch (_: Exception) {}
}
track = null
}
override fun onCatalystInstanceDestroy() {
stopInternal()
super.onCatalystInstanceDestroy()
}
}

View File

@@ -0,0 +1,16 @@
package com.ariacockpit
import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager
class PcmStreamPlayerPackage : ReactPackage {
override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
return listOf(PcmStreamPlayerModule(reactContext))
}
override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
return emptyList()
}
}

View File

@@ -274,6 +274,20 @@ const ChatScreen: React.FC = () => {
}
}
// XTTS PCM stream: straight to AudioTrack; on final, write the WAV cache
if (message.type === ('audio_pcm' as any)) {
const p = message.payload as any;
const refId = (p.messageId as string) || '';
audioService.handlePcmChunk(p).then((audioPath: any) => {
// If final and a cache path came back, update the message
if (p.final && audioPath && refId) {
setMessages(prev => prev.map(m =>
m.messageId === refId ? { ...m, audioPath } : m
));
}
}).catch(() => {});
}
// Thinking-indicator status from the bridge
if (message.type === 'agent_activity') {
const activity = (message.payload.activity as string) || 'idle';

View File

@@ -16,13 +16,36 @@ import AudioRecorderPlayer, {
OutputFormatAndroidType,
} from 'react-native-audio-recorder-player';
// Base64 encoder for binary strings (header bytes → base64)
const B64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
function btoaSafe(bin: string): string {
let out = '';
const len = bin.length;
for (let i = 0; i < len; i += 3) {
const b1 = bin.charCodeAt(i) & 0xff;
const b2 = i + 1 < len ? bin.charCodeAt(i + 1) & 0xff : 0;
const b3 = i + 2 < len ? bin.charCodeAt(i + 2) & 0xff : 0;
out += B64_CHARS[b1 >> 2];
out += B64_CHARS[((b1 & 0x03) << 4) | (b2 >> 4)];
out += i + 1 < len ? B64_CHARS[((b2 & 0x0f) << 2) | (b3 >> 6)] : '=';
out += i + 2 < len ? B64_CHARS[b3 & 0x3f] : '=';
}
return out;
}
// Native module for audio focus (ducking/muting other apps)
const { AudioFocus } = NativeModules as {
const { AudioFocus, PcmStreamPlayer } = NativeModules as {
AudioFocus?: {
requestDuck: () => Promise<boolean>;
requestExclusive: () => Promise<boolean>;
release: () => Promise<boolean>;
};
PcmStreamPlayer?: {
start: (sampleRate: number, channels: number) => Promise<boolean>;
writeChunk: (base64Pcm: string) => Promise<boolean>;
end: () => Promise<boolean>;
stop: () => Promise<boolean>;
};
};
// --- Types ---
@@ -79,6 +102,15 @@ class AudioService {
private speechDetected: boolean = false;
private speechStartTime: number = 0;
// PCM stream (XTTS): active session + cache buffer per messageId
private pcmStreamActive: boolean = false;
private pcmMessageId: string = '';
private pcmSampleRate: number = 24000;
private pcmChannels: number = 1;
private pcmBuffer: string[] = []; // base64 chunks for the WAV build later
private pcmBytesCollected: number = 0;
private readonly PCM_MAX_CACHE_BYTES = 30 * 1024 * 1024; // 30MB
// VAD State
private vadEnabled: boolean = false;
private lastSpeechTime: number = 0;
@@ -303,6 +335,141 @@ class AudioService {
}
}
/** Receive a PCM chunk from an audio_pcm message and play/cache it.
 * On final=true returns the cache path (file://), or '' if nothing was cached. */
async handlePcmChunk(payload: {
base64: string;
sampleRate?: number;
channels?: number;
messageId?: string;
chunk?: number;
final?: boolean;
}): Promise<string> {
if (!PcmStreamPlayer) {
console.warn('[Audio] PcmStreamPlayer native module not available');
return '';
}
const messageId = payload.messageId || '';
const sampleRate = payload.sampleRate || 24000;
const channels = payload.channels || 1;
const base64 = payload.base64 || '';
const isFinal = !!payload.final;
// New stream? (messageId changed or no stream active)
if (!this.pcmStreamActive || this.pcmMessageId !== messageId) {
// Cleanly end the previous stream (if any)
if (this.pcmStreamActive) {
try { await PcmStreamPlayer.stop(); } catch {}
// Discard the old buffer (it never hit final; a new message arrived in between)
this.pcmBuffer = [];
this.pcmBytesCollected = 0;
}
this.pcmStreamActive = true;
this.pcmMessageId = messageId;
this.pcmSampleRate = sampleRate;
this.pcmChannels = channels;
this.pcmBuffer = [];
this.pcmBytesCollected = 0;
try {
await PcmStreamPlayer.start(sampleRate, channels);
} catch (err) {
console.error('[Audio] PcmStreamPlayer.start failed:', err);
this.pcmStreamActive = false;
return '';
}
// Audio focus: duck other apps
AudioFocus?.requestDuck().catch(() => {});
}
// Play the chunk + cache it
if (base64) {
try { await PcmStreamPlayer.writeChunk(base64); } catch (err) { console.warn('[Audio] writeChunk', err); }
// Collect into the buffer for the cache (while it is not too large)
if (messageId && this.pcmBytesCollected < this.PCM_MAX_CACHE_BYTES) {
this.pcmBuffer.push(base64);
// Exact byte count: every 4 base64 chars encode 3 bytes, minus '=' padding
const pad = base64.endsWith('==') ? 2 : base64.endsWith('=') ? 1 : 0;
this.pcmBytesCollected += (base64.length / 4) * 3 - pad;
}
}
if (isFinal) {
// End the stream cleanly (it keeps playing until the buffer is empty)
try { await PcmStreamPlayer.end(); } catch {}
this.pcmStreamActive = false;
AudioFocus?.release().catch(() => {});
// Build a WAV file from the collected PCM chunks for replay
if (messageId && this.pcmBuffer.length > 0) {
const audioPath = await this._savePcmBufferAsWav(messageId);
this.pcmBuffer = [];
this.pcmBytesCollected = 0;
this.pcmMessageId = '';
return audioPath;
}
this.pcmMessageId = '';
}
return '';
}
/** Save the collected PCM chunks as a WAV file. Returns the file:// path. */
private async _savePcmBufferAsWav(messageId: string): Promise<string> {
try {
const dir = `${RNFS.DocumentDirectoryPath}/tts_cache`;
await RNFS.mkdir(dir).catch(() => {});
const path = `${dir}/${messageId}.wav`;
// WAV header for PCM s16le
const sampleRate = this.pcmSampleRate;
const channels = this.pcmChannels;
const bitsPerSample = 16;
const byteRate = sampleRate * channels * bitsPerSample / 8;
const blockAlign = channels * bitsPerSample / 8;
const dataSize = this.pcmBytesCollected;
const fileSize = 36 + dataSize;
// 44-byte header as raw bytes
const header = new Uint8Array(44);
const dv = new DataView(header.buffer);
// "RIFF"
header[0] = 0x52; header[1] = 0x49; header[2] = 0x46; header[3] = 0x46;
dv.setUint32(4, fileSize, true);
// "WAVE"
header[8] = 0x57; header[9] = 0x41; header[10] = 0x56; header[11] = 0x45;
// "fmt "
header[12] = 0x66; header[13] = 0x6d; header[14] = 0x74; header[15] = 0x20;
dv.setUint32(16, 16, true); // fmt chunk size
dv.setUint16(20, 1, true); // PCM format
dv.setUint16(22, channels, true);
dv.setUint32(24, sampleRate, true);
dv.setUint32(28, byteRate, true);
dv.setUint16(32, blockAlign, true);
dv.setUint16(34, bitsPerSample, true);
// "data"
header[36] = 0x64; header[37] = 0x61; header[38] = 0x74; header[39] = 0x61;
dv.setUint32(40, dataSize, true);
// Encode the 44-byte header as base64
let headerBin = '';
for (let i = 0; i < header.length; i++) {
headerBin += String.fromCharCode(header[i]);
}
const headerB64 = btoaSafe(headerBin);
// Write the file: the header, then all PCM chunks
await RNFS.writeFile(path, headerB64, 'base64');
for (const b64 of this.pcmBuffer) {
await RNFS.appendFile(path, b64, 'base64');
}
console.log(`[Audio] PCM cache written: ${path} (${(dataSize / 1024).toFixed(0)}KB, ${this.pcmBuffer.length} chunks)`);
return `file://${path}`;
} catch (err) {
console.warn('[Audio] _savePcmBufferAsWav failed:', err);
return '';
}
}
/** Queue audio from a local file (file:// path) and play it. */
async playFromPath(filePath: string): Promise<void> {
if (!filePath) return;
@@ -419,6 +586,14 @@ class AudioService {
if (this.preloadedPath) RNFS.unlink(this.preloadedPath).catch(() => {});
this.preloadedPath = '';
}
// Hard-stop the PCM stream as well (cancel/abort)
if (this.pcmStreamActive) {
PcmStreamPlayer?.stop().catch(() => {});
this.pcmStreamActive = false;
this.pcmBuffer = [];
this.pcmBytesCollected = 0;
this.pcmMessageId = '';
}
// Release audio focus
AudioFocus?.release().catch(() => {});
}

View File

@@ -1296,19 +1296,41 @@ class ARIABridge:
await self._emit_activity("idle", "")
return
elif msg_type == "audio_pcm":
# XTTS PCM stream received from the gaming PC → forward to the app.
# If the payload has no messageId (old XTTS bridge), resolve it via requestId.
error = payload.get("error", "")
if error:
logger.warning("[rvs] XTTS PCM-Fehler: %s", error)
return
linked_message_id = payload.get("messageId", "")
if not linked_message_id:
req_id_full = payload.get("requestId", "")
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
# Simply forward 1:1 with the messageId filled in
forwarded = dict(payload)
forwarded["messageId"] = linked_message_id
await self._send_to_rvs({
"type": "audio_pcm",
"payload": forwarded,
"timestamp": int(asyncio.get_event_loop().time() * 1000),
})
return
elif msg_type == "xtts_response":
# XTTS audio received from the gaming PC → forward to the app
# Legacy path (old XTTS bridge with a WAV response). Forward as
# type "audio"; the app uses its existing WAV queue player.
audio_b64 = payload.get("base64", "")
error = payload.get("error", "")
req_id_full = payload.get("requestId", "")
# The XTTS bridge appends per-chunk suffixes: "uuid_0", "uuid_1" → extract the base UUID
req_id_base = req_id_full.rsplit("_", 1)[0] if "_" in req_id_full else req_id_full
linked_message_id = self._xtts_request_to_message.get(req_id_base, "")
if error:
logger.warning("[rvs] XTTS Fehler: %s", error)
return
if audio_b64:
logger.info("[rvs] XTTS-Audio empfangen: %dKB", len(audio_b64) // 1365)
logger.info("[rvs] XTTS-Audio legacy empfangen: %dKB", len(audio_b64) // 1365)
await self._send_to_rvs({
"type": "audio",
"payload": {

View File

@@ -17,6 +17,7 @@ const ALLOWED_TYPES = new Set([
"xtts_request", "xtts_response", "xtts_list_voices", "xtts_voices_list", "voice_upload", "xtts_voice_saved",
"update_check", "update_available", "update_download", "update_data",
"agent_activity", "cancel_request",
"audio_pcm",
]);
// Token rooms: token -> { clients: Set<ws> }

View File

@@ -94,34 +94,33 @@ function connectRVS(forcePlain) {
// ── TTS Request Handler ─────────────────────────────
async function handleTTSRequest(payload) {
const { text, voice, requestId, language } = payload;
const { text, voice, requestId, language, messageId } = payload;
if (!text) return;
// Strip markdown and special characters for natural speech
// Markdown cleanup (the bridge now cleans up as well, but keep a safety net)
let cleanText = text
.replace(/\*\*([^*]+)\*\*/g, "$1") // **bold** → bold
.replace(/\*([^*]+)\*/g, "$1") // *italic* → italic
.replace(/`([^`]+)`/g, "$1") // `code` → code
.replace(/```[\s\S]*?```/g, "") // strip code blocks
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // [text](url) → text
.replace(/#{1,6}\s*/g, "") // strip ### headings
.replace(/>\s*/g, "") // strip > quotes
.replace(/[-*]\s+/g, "") // strip - list markers
.replace(/\n{2,}/g, ". ") // multiple newlines → period
.replace(/\n/g, ", ") // single newlines → comma
.replace(/\s{2,}/g, " ") // collapse repeated whitespace
.replace(/["""„]/g, "") // strip quotation marks
.replace(/\(\)/g, "") // strip empty parentheses
.replace(/\*\*([^*]+)\*\*/g, "$1")
.replace(/\*([^*]+)\*/g, "$1")
.replace(/`([^`]+)`/g, "$1")
.replace(/```[\s\S]*?```/g, "")
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
.replace(/#{1,6}\s*/g, "")
.replace(/>\s*/g, "")
.replace(/[-*]\s+/g, "")
.replace(/\n{2,}/g, ". ")
.replace(/\n/g, ", ")
.replace(/\s{2,}/g, " ")
.replace(/["""„]/g, "")
.replace(/\(\)/g, "")
.trim();
// Split the text into sentences, then merge into chunks of 2-3 sentences
// (more context = more consistent voice/volume, but not too long for the WebSocket)
// Sentence-wise chunks (the XTTS model loads context per call; group sentences)
const sentences = cleanText.split(/(?<=[.!?])\s+/)
.map(s => s.trim())
.filter(s => s.length > 0)
.map(s => s.replace(/[.]+$/, '')); // Punkt am Ende entfernen
.map(s => s.replace(/[.]+$/, ''));
const MAX_CHUNK_CHARS = 150; // max ~150 chars per chunk (fast rendering; preloading covers the gaps)
const MAX_CHUNK_CHARS = 150;
const chunks = [];
let currentChunk = '';
for (const sentence of sentences) {
@@ -135,45 +134,70 @@ async function handleTTSRequest(payload) {
if (currentChunk) chunks.push(currentChunk);
if (chunks.length === 0) return;
log(`TTS request: "${cleanText.slice(0, 60)}..." (${sentences.length} sentences → ${chunks.length} chunks, voice: ${voice || "default"}, lang: ${language || "de"})`);
log(`TTS request (streaming): "${cleanText.slice(0, 60)}..." (${chunks.length} chunks, voice: ${voice || "default"})`);
try {
const voiceSample = voice ? path.join(VOICES_DIR, `${voice}.wav`) : null;
const hasCustomVoice = voiceSample && fs.existsSync(voiceSample);
// Streaming: render a chunk → send immediately → next chunk
// The app plays back seamlessly via its preloading queue
let sentCount = 0;
let chunkIndex = 0;
// Audio format (extracted from the WAV header, once per request)
let pcmMeta = null;
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const isLastChunk = i === chunks.length - 1;
try {
const audioBuffer = await callXTTSAPI(chunk, language || "de", hasCustomVoice ? voiceSample : null);
if (audioBuffer && audioBuffer.length > 100) {
log(`TTS [${i + 1}/${chunks.length}]: ${(audioBuffer.length / 1024).toFixed(0)}KB — "${chunk.slice(0, 50)}"`);
// Streaming: PCM frames are pushed to RVS one after another,
// as soon as they arrive from the XTTS server
await streamXTTSAsPCM(
chunk,
language || "de",
hasCustomVoice ? voiceSample : null,
(pcmBase64, meta) => {
if (!pcmMeta) pcmMeta = meta;
sendToRVS({
type: "audio_pcm",
payload: {
requestId: requestId || "",
messageId: messageId || "",
base64: pcmBase64,
format: "pcm_s16le",
sampleRate: meta.sampleRate,
channels: meta.channels,
voice: voice || "default",
chunk: chunkIndex++,
final: false,
},
timestamp: Date.now(),
});
},
);
// After the last text chunk: send the final flag so the app knows it is done
if (isLastChunk && pcmMeta) {
sendToRVS({
type: "xtts_response",
type: "audio_pcm",
payload: {
requestId: `${requestId || ""}_${i}`,
base64: audioBuffer.toString("base64"),
mimeType: "audio/wav",
requestId: requestId || "",
messageId: messageId || "",
base64: "",
format: "pcm_s16le",
sampleRate: pcmMeta.sampleRate,
channels: pcmMeta.channels,
voice: voice || "default",
engine: "xtts",
part: i + 1,
totalParts: chunks.length,
chunk: chunkIndex++,
final: true,
},
timestamp: Date.now(),
});
sentCount++;
}
} catch (chunkErr) {
log(`TTS [${i + 1}/${chunks.length}] error: ${chunkErr.message}, skipping`);
}
}
log(`TTS complete: ${sentCount}/${chunks.length} chunks streamed`);
log(`TTS complete: ${chunkIndex} PCM frames streamed (${cleanText.length} chars)`);
} catch (err) {
log(`TTS error: ${err.message}`);
sendToRVS({
@@ -184,7 +208,19 @@ async function handleTTSRequest(payload) {
}
}
function callXTTSAPI(text, language, speakerWav) {
/**
 * Calls /tts_to_audio/ and streams the resulting WAV to the callback as PCM
 * frames while the response is still arriving. The WAV header is parsed
 * once; after that only raw PCM samples are forwarded.
 *
 * Why not the real /tts_stream/? daswer123's server has that endpoint, but
 * its audio quality is lower and the first chunk often contains artifacts.
 * The pragmatic route: /tts_to_audio/ plus reading the response stream in
 * chunks. That is not true server-side streaming, but it yields much
 * smaller network pieces, and the app can start seamless playback right
 * away via AudioTrack MODE_STREAM.
 */
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
return new Promise((resolve, reject) => {
const body = JSON.stringify({
text,
@@ -206,15 +242,59 @@ function callXTTSAPI(text, language, speakerWav) {
};
const req = http.request(options, (res) => {
const chunks = [];
res.on("data", (chunk) => chunks.push(chunk));
res.on("end", () => {
if (res.statusCode === 200) {
resolve(Buffer.concat(chunks));
} else {
reject(new Error(`XTTS API HTTP ${res.statusCode}: ${Buffer.concat(chunks).toString().slice(0, 200)}`));
if (res.statusCode !== 200) {
let body = "";
res.on("data", (d) => { body += d.toString(); });
res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)));
return;
}
let headerParsed = false;
let sampleRate = 24000;
let channels = 1;
let leftover = Buffer.alloc(0); // odd leftover bytes carried over to the next chunk
const HEADER_BYTES = 44;
let headerBuf = Buffer.alloc(0);
const PCM_CHUNK_BYTES = 8192; // ~170 ms at 24 kHz s16 mono
res.on("data", (chunk) => {
let data = chunk;
// Consume the WAV header (44 bytes; assumes the canonical layout)
if (!headerParsed) {
headerBuf = Buffer.concat([headerBuf, data]);
if (headerBuf.length < HEADER_BYTES) return;
// Read the format fields
const header = headerBuf.slice(0, HEADER_BYTES);
try {
channels = header.readUInt16LE(22);
sampleRate = header.readUInt32LE(24);
} catch (_) {}
headerParsed = true;
data = headerBuf.slice(HEADER_BYTES);
}
// leftover from the previous chunk + the new data
let combined = Buffer.concat([leftover, data]);
// Split into PCM_CHUNK_BYTES pieces (a multiple of 2, so no sample is split)
while (combined.length >= PCM_CHUNK_BYTES) {
const slice = combined.slice(0, PCM_CHUNK_BYTES);
combined = combined.slice(PCM_CHUNK_BYTES);
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
}
leftover = combined;
});
res.on("end", () => {
// Flush the remaining bytes
if (leftover.length > 0) {
onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
}
resolve();
});
res.on("error", reject);
});
req.on("error", reject);