From 4cbe184faaf532b3ebde1b1e9556a92571baaf67 Mon Sep 17 00:00:00 2001
From: duffyduck <info@hacker-net.de>
Date: Wed, 22 Apr 2026 17:38:53 +0200
Subject: [PATCH] feat: XTTS auf local-Mode (dauerhaft im VRAM) + /tts_stream +
 Fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

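Root cause der langen Render-Zeiten und der 400-Fehler von /tts_stream:
Der XTTS-Server (daswer123) laeuft per Default im apiManual/api-Mode —
das Modell wird pro Request neu geholt bzw. neu geladen, und Streaming
wird nicht unterstuetzt (400 'HTTP Streaming is only supported for
local models').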

Fix in xtts/docker-compose.yml:
  command: ['--listen', '-p', '8020', '-t', 'http://0.0.0.0:8020',
            '-ms', 'local',
            '-o', '/app/output', '-mf', '/app/xtts_models', '-sf', '/voices']

-ms local:
  - Modell dauerhaft im GPU-VRAM (~2GB, passt auf RTX 3060 mit 12GB)
  - Render startet sofort, kein per-Request-Load mehr
  - /tts_stream unterstuetzt → echtes progressive streaming
  - time-to-first-audio ~500ms statt 8-11s

xtts/bridge.js:
  /tts_stream ist jetzt der primaere Pfad. Schlaegt der Aufruf fehl
  (z.B. 400 im apiManual/api-Mode), greift /tts_to_audio/ als Fallback.
  Robust: stellt der User den Mode spaeter wieder um, greift der Fallback.

Erste Ladung nach dem Wechsel dauert einmalig laenger (Modell ins VRAM
laden). Danach: schnell + streaming.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 xtts/bridge.js          | 28 ++++++++++++++++++----------
 xtts/docker-compose.yml |  9 +++++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/xtts/bridge.js b/xtts/bridge.js
index 4a8ab0b..e5ad399 100644
--- a/xtts/bridge.js
+++ b/xtts/bridge.js
@@ -157,16 +157,24 @@ async function _runTTSRequest(payload) {
       });
     };
 
-    // /tts_stream funktioniert nur bei XTTS im local-Modus. daswer123 im
-    // Remote-API-Modus antwortet mit 400 'HTTP Streaming is only supported
-    // for local models'. Wir nutzen stabil /tts_to_audio/ (batch render,
-    // dann Response-Body chunkweise als PCM).
-    await streamXTTSBatch(
-      cleanText,
-      language || "de",
-      hasCustomVoice ? voiceSample : null,
-      onChunk,
-    );
+    // /tts_stream fuer echtes Streaming (funktioniert im XTTS local-Mode).
+    // Wenn Server im apiManual/api-Mode laeuft: 400 → Fallback auf /tts_to_audio/.
+    try {
+      await streamXTTSAsPCM(
+        cleanText,
+        language || "de",
+        hasCustomVoice ? voiceSample : null,
+        onChunk,
+      );
+    } catch (streamErr) {
+      log(`/tts_stream fehlgeschlagen (${streamErr.message.slice(0, 100)}) — Fallback /tts_to_audio/`);
+      await streamXTTSBatch(
+        cleanText,
+        language || "de",
+        hasCustomVoice ? voiceSample : null,
+        onChunk,
+      );
+    }
 
     // Am Ende: final-Flag damit App weiss "fertig" und Cache geschrieben werden kann
     if (pcmMeta) {
diff --git a/xtts/docker-compose.yml b/xtts/docker-compose.yml
index 7157a98..0e25fd9 100644
--- a/xtts/docker-compose.yml
+++ b/xtts/docker-compose.yml
@@ -33,6 +33,15 @@ services:
       - ./voices:/voices                        # Custom Voice Samples
     environment:
       - COQUI_TOS_AGREED=1
+    # Local-Modus: Modell bleibt dauerhaft im GPU-VRAM (~2GB). Vorteile:
+    #   - Render startet sofort (kein reload pro Request)
+    #   - /tts_stream funktioniert → echtes Streaming mit ~500ms time-to-first-audio
+    # Ohne diesen command: apiManual-Modus, jede Anfrage lädt Modell neu, kein Streaming.
+    command: ["--listen", "-p", "8020", "-t", "http://0.0.0.0:8020",
+              "-ms", "local",
+              "-o", "/app/output",
+              "-mf", "/app/xtts_models",
+              "-sf", "/voices"]
     restart: unless-stopped
 
   # ─── XTTS Bridge (verbindet zu RVS) ───────────