Compare commits
10 Commits
5492c7a46f
...
v0.0.4.7
| Author | SHA1 | Date | |
|---|---|---|---|
| bbbe69d928 | |||
| 23c39d5bba | |||
| 5328dc8595 | |||
| 0c03b4f161 | |||
| 31fe70bab5 | |||
| 39251b3d32 | |||
| 0623de32a0 | |||
| cd5e6e7ee6 | |||
| ee3e0a0af6 | |||
| 0783b1b99d |
@@ -79,8 +79,8 @@ android {
|
|||||||
applicationId "com.ariacockpit"
|
applicationId "com.ariacockpit"
|
||||||
minSdkVersion rootProject.ext.minSdkVersion
|
minSdkVersion rootProject.ext.minSdkVersion
|
||||||
targetSdkVersion rootProject.ext.targetSdkVersion
|
targetSdkVersion rootProject.ext.targetSdkVersion
|
||||||
versionCode 404
|
versionCode 407
|
||||||
versionName "0.0.4.4"
|
versionName "0.0.4.7"
|
||||||
// Fallback fuer Libraries mit Product Flavors
|
// Fallback fuer Libraries mit Product Flavors
|
||||||
missingDimensionStrategy 'react-native-camera', 'general'
|
missingDimensionStrategy 'react-native-camera', 'general'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,22 +13,29 @@ import com.facebook.react.bridge.ReactMethod
|
|||||||
import java.util.concurrent.LinkedBlockingQueue
|
import java.util.concurrent.LinkedBlockingQueue
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Streamt PCM-s16le Audio direkt via AudioTrack MODE_STREAM.
|
* Streamt PCM-s16le Audio direkt via AudioTrack MODE_STREAM mit Pre-Roll.
|
||||||
|
*
|
||||||
|
* Pre-Roll: AudioTrack wird zwar direkt gebaut und gefuttert, aber play()
|
||||||
|
* wird erst aufgerufen wenn PREROLL_SECONDS Audio im Buffer ist. So hat
|
||||||
|
* der Stream Zeit einen Vorrat aufzubauen — wenn XTTS mit RTF>1 rendert
|
||||||
|
* (langsamer als Echtzeit), laeuft der Buffer trotzdem nicht leer.
|
||||||
*
|
*
|
||||||
* Flow:
|
* Flow:
|
||||||
* JS: start(sampleRate, channels) → öffnet AudioTrack und startet Writer-Thread
|
* JS: start(sampleRate, channels) → öffnet AudioTrack (noch nicht play())
|
||||||
* JS: writeChunk(base64) → dekodiert, queued, Writer schreibt non-blocking
|
* JS: writeChunk(base64) → dekodiert, queued, Writer schreibt
|
||||||
* JS: end() → wartet bis Queue leer, schließt AudioTrack
|
* Writer: spielt los sobald PREROLL erreicht ist
|
||||||
* JS: stop() → Hart stoppen, Queue leeren (Cancel)
|
* JS: end() → wartet bis Queue leer, schließt
|
||||||
*
|
* JS: stop() → Hart stoppen (Cancel)
|
||||||
* Vorteil gegenüber Sound-File-Queue:
|
|
||||||
* - Keine Gap zwischen Chunks (AudioTrack puffert intern)
|
|
||||||
* - Erste Samples beginnen zu spielen sobald der erste Chunk da ist
|
|
||||||
* - Kein WAV-Header-Parsing pro Chunk
|
|
||||||
*/
|
*/
|
||||||
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
||||||
companion object {
|
companion object {
|
||||||
private const val TAG = "PcmStreamPlayer"
|
private const val TAG = "PcmStreamPlayer"
|
||||||
|
// Sekunden Audio die VOR play()-Start gepuffert sein muessen.
|
||||||
|
// 2.5s Vorrat = genug um XTTS-Render-Pausen zwischen Chunks zu puffern.
|
||||||
|
private const val PREROLL_SECONDS = 2.5
|
||||||
|
// Stille am Stream-Anfang, damit AudioTrack sauber anfaehrt und die
|
||||||
|
// ersten Samples nicht abgeschnitten werden (XTTS-Warmup + play()-Latenz).
|
||||||
|
private const val LEADING_SILENCE_SECONDS = 0.2
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun getName() = "PcmStreamPlayer"
|
override fun getName() = "PcmStreamPlayer"
|
||||||
@@ -38,6 +45,10 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
private var writerThread: Thread? = null
|
private var writerThread: Thread? = null
|
||||||
@Volatile private var writerShouldStop = false
|
@Volatile private var writerShouldStop = false
|
||||||
@Volatile private var endRequested = false
|
@Volatile private var endRequested = false
|
||||||
|
@Volatile private var prerollBytes: Int = 0
|
||||||
|
@Volatile private var playbackStarted = false
|
||||||
|
@Volatile private var bytesBuffered: Long = 0
|
||||||
|
@Volatile private var streamBytesPerFrame: Int = 2 // mono s16le default
|
||||||
|
|
||||||
// ── Lifecycle ──
|
// ── Lifecycle ──
|
||||||
|
|
||||||
@@ -50,10 +61,14 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
val channelConfig = if (channels == 2) AudioFormat.CHANNEL_OUT_STEREO else AudioFormat.CHANNEL_OUT_MONO
|
||||||
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
val encoding = AudioFormat.ENCODING_PCM_16BIT
|
||||||
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
|
val minBuf = AudioTrack.getMinBufferSize(sampleRate, channelConfig, encoding)
|
||||||
// Grosszuegiger Buffer: 32x MinSize — tolerant gegen Netzwerk-Jitter und
|
val bytesPerSecond = sampleRate * channels * 2 // 16-bit = 2 bytes
|
||||||
// bursty XTTS-Delivery (Render dauert 1-3s, dann kommen alle Samples
|
// Buffer muss mindestens PREROLL + etwas Spielraum fassen.
|
||||||
// auf einmal). Bei 24kHz mono s16 entspricht 128KB ca. 2.7 Sekunden.
|
val prerollTarget = (bytesPerSecond * PREROLL_SECONDS).toInt()
|
||||||
val bufferSize = (minBuf * 32).coerceAtLeast(128 * 1024)
|
val bufferSize = (minBuf * 32).coerceAtLeast(prerollTarget * 2)
|
||||||
|
prerollBytes = prerollTarget
|
||||||
|
bytesBuffered = 0
|
||||||
|
playbackStarted = false
|
||||||
|
streamBytesPerFrame = channels * 2 // s16 = 2 bytes per sample
|
||||||
|
|
||||||
val newTrack = AudioTrack.Builder()
|
val newTrack = AudioTrack.Builder()
|
||||||
.setAudioAttributes(
|
.setAudioAttributes(
|
||||||
@@ -73,7 +88,7 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
.setTransferMode(AudioTrack.MODE_STREAM)
|
.setTransferMode(AudioTrack.MODE_STREAM)
|
||||||
.build()
|
.build()
|
||||||
|
|
||||||
newTrack.play()
|
// AudioTrack erstellen — play() wird erst aufgerufen wenn Pre-Roll erreicht.
|
||||||
track = newTrack
|
track = newTrack
|
||||||
queue.clear()
|
queue.clear()
|
||||||
writerShouldStop = false
|
writerShouldStop = false
|
||||||
@@ -82,27 +97,83 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
|
|||||||
writerThread = Thread({
|
writerThread = Thread({
|
||||||
val t = track ?: return@Thread
|
val t = track ?: return@Thread
|
||||||
try {
|
try {
|
||||||
|
// Leading-Silence in den Buffer — gibt AudioTrack Zeit anzufahren.
|
||||||
|
val silenceBytes = ((sampleRate * channels * 2) * LEADING_SILENCE_SECONDS).toInt() and 0x7FFFFFFE
|
||||||
|
if (silenceBytes > 0) {
|
||||||
|
val silence = ByteArray(silenceBytes)
|
||||||
|
var silOff = 0
|
||||||
|
while (silOff < silence.size && !writerShouldStop) {
|
||||||
|
val w = t.write(silence, silOff, silence.size - silOff)
|
||||||
|
if (w <= 0) break
|
||||||
|
silOff += w
|
||||||
|
}
|
||||||
|
bytesBuffered += silence.size
|
||||||
|
}
|
||||||
while (!writerShouldStop) {
|
while (!writerShouldStop) {
|
||||||
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
|
val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS) ?: run {
|
||||||
if (endRequested) return@Thread
|
if (endRequested) {
|
||||||
|
// Falls wir vor Pre-Roll enden (kurzer Text): trotzdem abspielen
|
||||||
|
if (!playbackStarted) {
|
||||||
|
try { t.play() } catch (_: Exception) {}
|
||||||
|
playbackStarted = true
|
||||||
|
}
|
||||||
|
return@Thread
|
||||||
|
}
|
||||||
null
|
null
|
||||||
} ?: continue
|
} ?: continue
|
||||||
|
|
||||||
|
// Pre-Roll Check: play() erst wenn genug gepuffert
|
||||||
|
if (!playbackStarted && bytesBuffered + data.size >= prerollBytes) {
|
||||||
|
try {
|
||||||
|
t.play()
|
||||||
|
playbackStarted = true
|
||||||
|
Log.i(TAG, "Playback gestartet nach Pre-Roll ${bytesBuffered + data.size} Bytes")
|
||||||
|
} catch (e: Exception) {
|
||||||
|
Log.w(TAG, "play() failed: ${e.message}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var offset = 0
|
var offset = 0
|
||||||
while (offset < data.size && !writerShouldStop) {
|
while (offset < data.size && !writerShouldStop) {
|
||||||
val written = t.write(data, offset, data.size - offset)
|
val written = t.write(data, offset, data.size - offset)
|
||||||
if (written <= 0) break
|
if (written <= 0) break
|
||||||
offset += written
|
offset += written
|
||||||
}
|
}
|
||||||
|
bytesBuffered += data.size
|
||||||
}
|
}
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
Log.w(TAG, "Writer-Thread Fehler: ${e.message}")
|
||||||
} finally {
|
} finally {
|
||||||
|
// Warten bis alle geschriebenen Samples tatsaechlich abgespielt sind,
|
||||||
|
// sonst cuttet t.release() die letzten Sekunden ab.
|
||||||
|
try {
|
||||||
|
val totalFrames = (bytesBuffered / streamBytesPerFrame).toInt()
|
||||||
|
var lastPos = -1
|
||||||
|
var stalledCount = 0
|
||||||
|
while (!writerShouldStop) {
|
||||||
|
val pos = t.playbackHeadPosition
|
||||||
|
if (pos >= totalFrames) break
|
||||||
|
// Safety: wenn Position 2s nicht mehr vorwaerts → AudioTrack hing
|
||||||
|
if (pos == lastPos) {
|
||||||
|
stalledCount++
|
||||||
|
if (stalledCount > 40) {
|
||||||
|
Log.w(TAG, "playback stalled at $pos/$totalFrames — give up")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
stalledCount = 0
|
||||||
|
lastPos = pos
|
||||||
|
}
|
||||||
|
Thread.sleep(50)
|
||||||
|
}
|
||||||
|
Log.i(TAG, "Playback fertig: frames=$totalFrames pos=${t.playbackHeadPosition}")
|
||||||
|
} catch (_: Exception) {}
|
||||||
try { t.stop() } catch (_: Exception) {}
|
try { t.stop() } catch (_: Exception) {}
|
||||||
try { t.release() } catch (_: Exception) {}
|
try { t.release() } catch (_: Exception) {}
|
||||||
}
|
}
|
||||||
}, "PcmStreamWriter").apply { start() }
|
}, "PcmStreamWriter").apply { start() }
|
||||||
|
|
||||||
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B")
|
Log.i(TAG, "Stream gestartet: ${sampleRate}Hz ch=$channels buf=${bufferSize}B preroll=${prerollBytes}B")
|
||||||
promise.resolve(true)
|
promise.resolve(true)
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
Log.e(TAG, "start fehlgeschlagen", e)
|
Log.e(TAG, "start fehlgeschlagen", e)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "aria-cockpit",
|
"name": "aria-cockpit",
|
||||||
"version": "0.0.4.4",
|
"version": "0.0.4.7",
|
||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"android": "react-native run-android",
|
"android": "react-native run-android",
|
||||||
|
|||||||
@@ -150,6 +150,15 @@ def _small_range_to_words(m):
|
|||||||
return f"{_num_to_words_de(a)} bis {_num_to_words_de(b)}"
|
return f"{_num_to_words_de(a)} bis {_num_to_words_de(b)}"
|
||||||
|
|
||||||
|
|
||||||
|
def _decimal_to_words(m):
|
||||||
|
"""'0.1' / '0,1' → 'null komma eins', '1,25' → 'eins komma zwei fuenf'."""
|
||||||
|
int_part = int(m.group(1))
|
||||||
|
dec_part = m.group(2)
|
||||||
|
int_word = _num_to_words_de(int_part) if 0 <= int_part <= 59 else str(int_part)
|
||||||
|
dec_words = " ".join(_num_to_words_de(int(d)) for d in dec_part)
|
||||||
|
return f"{int_word} komma {dec_words}"
|
||||||
|
|
||||||
|
|
||||||
_UNIT_WORDS = [
|
_UNIT_WORDS = [
|
||||||
(r'\bTB\b', 'Terabyte'),
|
(r'\bTB\b', 'Terabyte'),
|
||||||
(r'\bGB\b', 'Gigabyte'),
|
(r'\bGB\b', 'Gigabyte'),
|
||||||
@@ -236,6 +245,11 @@ def clean_text_for_tts(text: str) -> str:
|
|||||||
# Kleine Zahlen-Bereiche ohne "Uhr": "5-6" → "fuenf bis sechs"
|
# Kleine Zahlen-Bereiche ohne "Uhr": "5-6" → "fuenf bis sechs"
|
||||||
t = _re_tts.sub(r'\b(\d{1,2})\s*[-–]\s*(\d{1,2})\b', _small_range_to_words, t)
|
t = _re_tts.sub(r'\b(\d{1,2})\s*[-–]\s*(\d{1,2})\b', _small_range_to_words, t)
|
||||||
|
|
||||||
|
# Dezimalzahlen: "0.1" / "0,5" / "1,25" → "null komma eins" / "null komma fuenf" / ...
|
||||||
|
# Muss vor "Zahl+Einheit" laufen, sonst frisst die Unit-Regel den Nachkommaanteil.
|
||||||
|
# Lookahead verhindert Match auf IP-artigen Strings wie 192.168.1.1.
|
||||||
|
t = _re_tts.sub(r'\b(\d+)[.,](\d+)(?![.,\d])', _decimal_to_words, t)
|
||||||
|
|
||||||
# Zahlen + Einheit: "22GB" → "22 Gigabyte" (Leerzeichen einfuegen)
|
# Zahlen + Einheit: "22GB" → "22 Gigabyte" (Leerzeichen einfuegen)
|
||||||
t = _re_tts.sub(r'(\d+)([A-Za-z]{1,4})\b', r'\1 \2', t)
|
t = _re_tts.sub(r'(\d+)([A-Za-z]{1,4})\b', r'\1 \2', t)
|
||||||
|
|
||||||
|
|||||||
+5
-1
@@ -216,11 +216,15 @@ function streamXTTSAsPCM(text, language, speakerWav, onPcmChunk) {
|
|||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
|
// Wichtig: speaker_wav MUSS als Query-Key dabei sein (Pydantic required) —
|
||||||
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
|
// auch bei default-voice mit leerem Wert. Sonst gibt's HTTP 422.
|
||||||
|
// stream_chunk_size=100: Kompromiss zwischen first-audio-latency und
|
||||||
|
// gap-risk. Bei RTX 3060 (RTF 1.48) ~3s bis erster Audio, Chunks gross
|
||||||
|
// genug dass der AudioTrack-Buffer (128KB ≈ 2.7s) zwischen Chunks nicht
|
||||||
|
// leerlauft.
|
||||||
const qs = new URLSearchParams();
|
const qs = new URLSearchParams();
|
||||||
qs.set("text", text);
|
qs.set("text", text);
|
||||||
qs.set("language", language || "de");
|
qs.set("language", language || "de");
|
||||||
qs.set("speaker_wav", speakerWav || "");
|
qs.set("speaker_wav", speakerWav || "");
|
||||||
qs.set("stream_chunk_size", "40");
|
qs.set("stream_chunk_size", "100");
|
||||||
|
|
||||||
const url = new URL(XTTS_API_URL);
|
const url = new URL(XTTS_API_URL);
|
||||||
const fullPath = `/tts_stream?${qs.toString()}`;
|
const fullPath = `/tts_stream?${qs.toString()}`;
|
||||||
|
|||||||
+6
-21
@@ -33,27 +33,12 @@ services:
|
|||||||
- ./voices:/voices # Custom Voice Samples
|
- ./voices:/voices # Custom Voice Samples
|
||||||
environment:
|
environment:
|
||||||
- COQUI_TOS_AGREED=1
|
- COQUI_TOS_AGREED=1
|
||||||
# Local-Modus: Modell bleibt dauerhaft im GPU-VRAM (~2GB). Vorteile:
|
# Local-Modus statt default "apiManual": Modell bleibt im GPU-VRAM,
|
||||||
# - Render startet sofort (kein reload pro Request)
|
# Render startet sofort, /tts_stream funktioniert.
|
||||||
# - /tts_stream funktioniert → echtes Streaming mit ~500ms time-to-first-audio
|
# Default-CMD des Images liest diese ENV: -ms ${MODEL_SOURCE:-"apiManual"}
|
||||||
# Ohne diesen command: apiManual-Modus, jede Anfrage laedt Modell neu, kein Streaming.
|
- MODEL_SOURCE=local
|
||||||
# Der NVIDIA-Entrypoint erwartet Python als ausfuehrbares Command, nicht nur Flags.
|
# Speaker-Folder auf unsere gemounteten voices zeigen lassen
|
||||||
command:
|
- EXAMPLE_FOLDER=/voices
|
||||||
- python
|
|
||||||
- -m
|
|
||||||
- xtts_api_server
|
|
||||||
- -hs
|
|
||||||
- "0.0.0.0"
|
|
||||||
- -p
|
|
||||||
- "8020"
|
|
||||||
- -ms
|
|
||||||
- local
|
|
||||||
- -o
|
|
||||||
- /app/output
|
|
||||||
- -mf
|
|
||||||
- /app/xtts_models
|
|
||||||
- -sf
|
|
||||||
- /voices
|
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
# ─── XTTS Bridge (verbindet zu RVS) ───────────
|
# ─── XTTS Bridge (verbindet zu RVS) ───────────
|
||||||
|
|||||||
Reference in New Issue
Block a user