Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 31fe70bab5 | |||
| 39251b3d32 | |||
| 0623de32a0 | |||
| cd5e6e7ee6 | |||
| ee3e0a0af6 | |||
| 0783b1b99d | |||
| 5492c7a46f | |||
| 4cbe184faa | |||
| 647a1cb726 | |||
| 73263b69a6 | |||
| c62ceafdc2 | |||
| 9b5a35cb4a | |||
| 5ac1a0a522 | |||
| a28b46a809 | |||
| 59c8d36a3d |
@@ -79,8 +79,8 @@ android {
|
||||
applicationId "com.ariacockpit"
|
||||
minSdkVersion rootProject.ext.minSdkVersion
|
||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||
versionCode 403
|
||||
versionName "0.0.4.3"
|
||||
versionCode 405
|
||||
versionName "0.0.4.5"
|
||||
// Fallback fuer Libraries mit Product Flavors
|
||||
missingDimensionStrategy 'react-native-camera', 'general'
|
||||
}
|
||||
|
||||
@@ -13,22 +13,26 @@ import com.facebook.react.bridge.ReactMethod
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
|
||||
/**
|
||||
* Streamt PCM-s16le Audio direkt via AudioTrack MODE_STREAM.
|
||||
* Streamt PCM-s16le Audio direkt via AudioTrack MODE_STREAM mit Pre-Roll.
|
||||
*
|
||||
* Pre-Roll: AudioTrack wird zwar direkt gebaut und gefuettert, aber play()
|
||||
* wird erst aufgerufen wenn PREROLL_SECONDS Audio im Buffer ist. So hat
|
||||
* der Stream Zeit einen Vorrat aufzubauen — wenn XTTS mit RTF>1 rendert
|
||||
* (langsamer als Echtzeit), laeuft der Buffer trotzdem nicht leer.
|
||||
*
|
||||
* Flow:
|
||||
* JS: start(sampleRate, channels) → öffnet AudioTrack und startet Writer-Thread
|
||||
* JS: writeChunk(base64) → dekodiert, queued, Writer schreibt non-blocking
|
||||
* JS: end() → wartet bis Queue leer, schließt AudioTrack
|
||||
* JS: stop() → Hart stoppen, Queue leeren (Cancel)
|
||||
*
|
||||
* Vorteil gegenüber Sound-File-Queue:
|
||||
* - Keine Gap zwischen Chunks (AudioTrack puffert intern)
|
||||
* - Erste Samples beginnen zu spielen sobald der erste Chunk da ist
|
||||
* - Kein WAV-Header-Parsing pro Chunk
|
||||
* JS: start(sampleRate, channels) → öffnet AudioTrack (noch nicht play())
|
||||
* JS: writeChunk(base64) → dekodiert, queued, Writer schreibt
|
||||
* Writer: spielt los sobald PREROLL erreicht ist
|
||||
* JS: end() → wartet bis Queue leer, schließt
|
||||
* JS: stop() → Hart stoppen (Cancel)
|
||||
*/
|
||||
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
||||
companion object {
|
||||
private const val TAG = "PcmStreamPlayer"
|
||||
// Sekunden Audio die VOR play()-Start gepuffert sein muessen.
|
||||
// 2.5s Vorrat = genug um XTTS-Render-Pausen zwischen Chunks zu puffern.
|
||||
private const val PREROLL_SECONDS = 2.5
|
||||
}
|
||||
|
||||
override fun getName() = "PcmStreamPlayer"
|
||||
@@ -38,6 +42,9 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
private var writerThread: Thread? = null
|
||||
@Volatile private var writerShouldStop = false
|
||||
@Volatile private var endRequested = false
|
||||
@Volatile private var prerollBytes: Int = 0
|
||||
@Volatile private var playbackStarted = false
|
||||
@Volatile private var bytesBuffered: Long = 0
|
||||
|
||||
// ── Lifecycle ──
|
||||
|
||||
@@ -50,10 +57,13 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
||||
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
||||
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
|
||||
// Grosszuegiger Buffer: 32x MinSize — tolerant gegen Netzwerk-Jitter und
|
||||
// bursty XTTS-Delivery (Render dauert 1-3s, dann kommen alle Samples
|
||||
// auf einmal). Bei 24kHz mono s16 entspricht 128KB ca. 2.7 Sekunden.
|
||||
val bufferSize = (minBuf * 32).coerceAtLeast(128 * 1024)
|
||||
val bytesPerSecond = sampleRate * channels * 2 // 16-bit = 2 bytes
|
||||
// Buffer muss mindestens PREROLL + etwas Spielraum fassen.
|
||||
val prerollTarget = (bytesPerSecond * PREROLL_SECONDS).toInt()
|
||||
val bufferSize = (minBuf * 32).coerceAtLeast(prerollTarget * 2)
|
||||
prerollBytes = prerollTarget
|
||||
bytesBuffered = 0
|
||||
playbackStarted = false
|
||||
|
||||
val newTrack = AudioTrack.Builder()
|
||||
.setAudioAttributes(
|
||||
@@ -73,7 +83,7 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
.setTransferMode(AudioTrack.MODE_STREAM)
|
||||
.build()
|
||||
|
||||
newTrack.play()
|
||||
// AudioTrack erstellen — play() wird erst aufgerufen wenn Pre-Roll erreicht.
|
||||
track = newTrack
|
||||
queue.clear()
|
||||
writerShouldStop = false
|
||||
@@ -84,15 +94,35 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
try {
|
||||
while (!writerShouldStop) {
|
||||
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
|
||||
if (endRequested) return@Thread
|
||||
if (endRequested) {
|
||||
// Falls wir vor Pre-Roll enden (kurzer Text): trotzdem abspielen
|
||||
if (!playbackStarted) {
|
||||
try { t.play() } catch (_: Exception) {}
|
||||
playbackStarted = true
|
||||
}
|
||||
return@Thread
|
||||
}
|
||||
null
|
||||
} ?: continue
|
||||
|
||||
// Pre-Roll Check: play() erst wenn genug gepuffert
|
||||
if (!playbackStarted && bytesBuffered + data.size >= prerollBytes) {
|
||||
try {
|
||||
t.play()
|
||||
playbackStarted = true
|
||||
Log.i(TAG, "Playback gestartet nach Pre-Roll ${bytesBuffered + data.size} Bytes")
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "play() failed: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
var offset = 0
|
||||
while (offset < data.size && !writerShouldStop) {
|
||||
val written = t.write(data, offset, data.size - offset)
|
||||
if (written <= 0) break
|
||||
offset += written
|
||||
}
|
||||
bytesBuffered += data.size
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
||||
@@ -102,7 +132,7 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
||||
}
|
||||
}, "PcmStreamWriter").apply { start() }
|
||||
|
||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
|
||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B preroll=${prerollBytes}B")
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "start fehlgeschlagen", e)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "aria-cockpit",
|
||||
"version": "0.0.4.3",
|
||||
"version": "0.0.4.5",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"android": "react-native run-android",
|
||||
|
||||
+134
-49
@@ -138,31 +138,43 @@ async function _runTTSRequest(payload) {
|
||||
let chunkIndex = 0;
|
||||
let pcmMeta = null;
|
||||
|
||||
// EIN Request fuer den GANZEN Text — kein Gap zwischen Saetzen.
|
||||
// XTTS rendert und wir streamen PCM sobald es reinkommt.
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
(pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
},
|
||||
);
|
||||
const onChunk = (pcmBase64, meta) => {
|
||||
if (!pcmMeta) pcmMeta = meta;
|
||||
sendToRVS({
|
||||
type: "audio_pcm",
|
||||
payload: {
|
||||
requestId: requestId || "",
|
||||
messageId: messageId || "",
|
||||
base64: pcmBase64,
|
||||
format: "pcm_s16le",
|
||||
sampleRate: meta.sampleRate,
|
||||
channels: meta.channels,
|
||||
voice: voice || "default",
|
||||
chunk: chunkIndex++,
|
||||
final: false,
|
||||
},
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
};
|
||||
|
||||
// /tts_stream fuer echtes Streaming (funktioniert im XTTS local-Mode).
|
||||
// Wenn Server im apiManual/api-Mode laeuft: 400 → Fallback auf /tts_to_audio/.
|
||||
try {
|
||||
await streamXTTSAsPCM(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
onChunk,
|
||||
);
|
||||
} catch (streamErr) {
|
||||
log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`);
|
||||
await streamXTTSBatch(
|
||||
cleanText,
|
||||
language || "de",
|
||||
hasCustomVoice ? voiceSample : null,
|
||||
onChunk,
|
||||
);
|
||||
}
|
||||
|
||||
// Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
|
||||
if (pcmMeta) {
|
||||
@@ -195,45 +207,48 @@ async function _runTTSRequest(payload) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Ruft /tts_to_audio/ auf und streamt das resultierende WAV bereits waehrend
|
||||
* des Empfangs in PCM-Frames an den Callback. Der WAV-Header wird einmal
|
||||
* geparst, danach werden nur noch raw PCM-Samples weitergeleitet.
|
||||
*
|
||||
* Warum nicht echtes /tts_stream/? daswer123 hat den Endpoint, aber die
|
||||
* Audio-Quality ist dort niedriger und er produziert beim ersten Chunk
|
||||
* oft Artefakte. Pragmatischer Weg: /tts_to_audio/ + Response-Stream
|
||||
* chunkweise auslesen. Das ist zwar kein echtes Server-Streaming, aber
|
||||
* gibt uns deutlich kleinere Netzwerk-Haeppchen und die App kann via
|
||||
* AudioTrack MODE_STREAM sofort nahtlos abspielen.
|
||||
* Ruft /tts_stream auf — echter Streaming-Endpoint bei daswer123.
|
||||
* Der Server erlaubt hier nur GET (Allow: GET); ein POST mit JSON-Body
|
||||
* scheitert mit HTTP 405. Manche Versionen wollen GET + Query, andere
|
||||
* POST + JSON. Diese Funktion verwendet GET + Query-Parameter.
|
||||
*/
|
||||
function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const body = JSON.stringify({
|
||||
text,
|
||||
language,
|
||||
speaker_wav: speakerWav || "",
|
||||
});
|
||||
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
|
||||
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
|
||||
// stream_chunk_size=100: Kompromiss zwischen first-audio-latency und
|
||||
// gap-risk. Bei RTX 3060 (RTF 1.48) ~3s bis erster Audio, Chunks gross
|
||||
// genug dass der AudioTrack-Buffer (mind. 2x Pre-Roll ≈ 5s) zwischen Chunks nicht
|
||||
// leerlaeuft.
|
||||
const qs = new URLSearchParams();
|
||||
qs.set("text", text);
|
||||
qs.set("language", language || "de");
|
||||
qs.set("speaker_wav", speakerWav || "");
|
||||
qs.set("stream_chunk_size", "100");
|
||||
|
||||
const url = new URL(`${XTTS_API_URL}/tts_to_audio/`);
|
||||
const url = new URL(XTTS_API_URL);
|
||||
const fullPath = `/tts_stream?${qs.toString()}`;
|
||||
const options = {
|
||||
hostname: url.hostname,
|
||||
port: url.port,
|
||||
path: url.pathname,
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"Content-Length": Buffer.byteLength(body),
|
||||
},
|
||||
port: url.port || 80,
|
||||
path: fullPath,
|
||||
method: "GET",
|
||||
timeout: 60000,
|
||||
};
|
||||
|
||||
log(`TTS GET /tts_stream?text=${text.slice(0, 30)}... (voice=${speakerWav ? "custom" : "default"})`);
|
||||
|
||||
const req = http.request(options, (res) => {
|
||||
if (res.statusCode !== 200) {
|
||||
let body = "";
|
||||
res.on("data", (d) => { body += d.toString(); });
|
||||
res.on("end", () => reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`)));
|
||||
res.on("end", () => {
|
||||
log(`XTTS /tts_stream ${res.statusCode}: ${body.slice(0, 300)}`);
|
||||
reject(new Error(`XTTS HTTP ${res.statusCode}: ${body.slice(0, 200)}`));
|
||||
});
|
||||
return;
|
||||
}
|
||||
log(`TTS stream verbunden, empfange PCM...`);
|
||||
|
||||
let headerParsed = false;
|
||||
let sampleRate = 24000;
|
||||
@@ -285,6 +300,76 @@ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
||||
|
||||
req.on("error", reject);
|
||||
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS API Timeout (60s)")); });
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Fallback: /tts_to_audio/ (POST JSON) — rendert komplett, dann response.
|
||||
* Kein echtes Streaming, aber stabil als Backup wenn /tts_stream nicht geht.
|
||||
* Shared chunking-Logik mit streamXTTSAsPCM — parst WAV-Header, stueckelt PCM.
|
||||
*/
|
||||
function streamXTTSBatch(text, language, speakerWav, onPcmChunk) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const body = JSON.stringify({
|
||||
text,
|
||||
language: language || "de",
|
||||
speaker_wav: speakerWav || "",
|
||||
});
|
||||
const url = new URL(XTTS_API_URL);
|
||||
const options = {
|
||||
hostname: url.hostname,
|
||||
port: url.port || 80,
|
||||
path: "/tts_to_audio/",
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"Content-Length": Buffer.byteLength(body),
|
||||
},
|
||||
timeout: 60000,
|
||||
};
|
||||
|
||||
const req = http.request(options, (res) => {
|
||||
if (res.statusCode !== 200) {
|
||||
let rb = "";
|
||||
res.on("data", (d) => { rb += d.toString(); });
|
||||
res.on("end", () => reject(new Error(`XTTS Batch HTTP ${res.statusCode}: ${rb.slice(0, 200)}`)));
|
||||
return;
|
||||
}
|
||||
let headerParsed = false;
|
||||
let sampleRate = 24000;
|
||||
let channels = 1;
|
||||
let leftover = Buffer.alloc(0);
|
||||
let headerBuf = Buffer.alloc(0);
|
||||
const HEADER_BYTES = 44;
|
||||
const PCM_CHUNK_BYTES = 8192;
|
||||
|
||||
res.on("data", (chunk) => {
|
||||
let data = chunk;
|
||||
if (!headerParsed) {
|
||||
headerBuf = Buffer.concat([headerBuf, data]);
|
||||
if (headerBuf.length < HEADER_BYTES) return;
|
||||
const header = headerBuf.slice(0, HEADER_BYTES);
|
||||
try { channels = header.readUInt16LE(22); sampleRate = header.readUInt32LE(24); } catch (_) {}
|
||||
headerParsed = true;
|
||||
data = headerBuf.slice(HEADER_BYTES);
|
||||
}
|
||||
let combined = Buffer.concat([leftover, data]);
|
||||
while (combined.length >= PCM_CHUNK_BYTES) {
|
||||
const slice = combined.slice(0, PCM_CHUNK_BYTES);
|
||||
combined = combined.slice(PCM_CHUNK_BYTES);
|
||||
onPcmChunk(slice.toString("base64"), { sampleRate, channels });
|
||||
}
|
||||
leftover = combined;
|
||||
});
|
||||
res.on("end", () => {
|
||||
if (leftover.length > 0) onPcmChunk(leftover.toString("base64"), { sampleRate, channels });
|
||||
resolve();
|
||||
});
|
||||
res.on("error", reject);
|
||||
});
|
||||
req.on("error", reject);
|
||||
req.on("timeout", () => { req.destroy(); reject(new Error("XTTS Batch Timeout (60s)")); });
|
||||
req.write(body);
|
||||
req.end();
|
||||
});
|
||||
|
||||
@@ -33,6 +33,12 @@ services:
|
||||
- ./voices:/voices # Custom Voice Samples
|
||||
environment:
|
||||
- COQUI_TOS_AGREED=1
|
||||
# Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM,
|
||||
# Render startet sofort, /tts_stream funktioniert.
|
||||
# Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"}
|
||||
- MODEL_SOURCE=local
|
||||
# Speaker-Folder auf unsere gemounteten voices zeigen lassen
|
||||
- EXAMPLE_FOLDER=/voices
|
||||
restart: unless-stopped
|
||||
|
||||
# ─── XTTS Bridge (verbindet zu RVS) ───────────
|
||||
|
||||
Reference in New Issue
Block a user