diff --git a/android/src/screens/SettingsScreen.tsx b/android/src/screens/SettingsScreen.tsx
index c661448..d5bd7e3 100644
--- a/android/src/screens/SettingsScreen.tsx
+++ b/android/src/screens/SettingsScreen.tsx
@@ -807,23 +807,13 @@ const SettingsScreen: React.FC = () => {
           <View style={{marginTop: 20}}>
             <Text style={styles.toggleLabel}>Stimme (geraetelokal)</Text>
             <Text style={styles.toggleHint}>
-              Eigene Wahl fuer dieses Geraet. Ohne Auswahl gilt der Diagnostic-Default.
+              Eine geklonte Stimme auswaehlen. F5-TTS braucht zwingend eine Referenz —
+              ohne Auswahl gilt die in Diagnostic gewaehlte globale Stimme.
             </Text>
 
-            {/* Default-Option */}
-            <TouchableOpacity
-              style={[styles.voiceRow, xttsVoice === '' && styles.voiceRowActive]}
-              onPress={() => selectVoice('')}
-            >
-              <Text style={[styles.voiceRowName, xttsVoice === '' && styles.voiceRowNameActive]}>
-                Standard (Diagnostic-Default)
-              </Text>
-              {xttsVoice === '' && <Text style={styles.voiceRowCheck}>{'\u2713'}</Text>}
-            </TouchableOpacity>
-
             {availableVoices.length === 0 ? (
               <Text style={[styles.toggleHint, {marginTop: 8, textAlign: 'center'}]}>
-                Keine eigenen Stimmen auf dem XTTS-Server.
+                Keine geklonten Stimmen vorhanden — unten "Eigene Stimme aufnehmen".
               </Text>
             ) : (
               availableVoices.map(v => (
diff --git a/diagnostic/index.html b/diagnostic/index.html
index f98e2a9..9d32de2 100644
--- a/diagnostic/index.html
+++ b/diagnostic/index.html
@@ -437,11 +437,11 @@
           <label class="toggle"><input type="checkbox" id="diag-tts-enabled" checked onchange="sendVoiceConfig()"><span class="slider"></span></label>
         </div>
 
-        <!-- XTTS Stimme -->
+        <!-- F5-TTS Stimme (zwingend eine Voice waehlen — F5-TTS braucht eine Referenz) -->
         <div style="display:flex;align-items:center;gap:12px;margin-bottom:6px;">
-          <label style="color:#8888AA;font-size:12px;">XTTS Stimme:</label>
+          <label style="color:#8888AA;font-size:12px;">F5-TTS Stimme:</label>
           <select id="diag-xtts-voice" onchange="sendVoiceConfig()" style="background:#1E1E2E;color:#fff;border:1px solid #2A2A3E;border-radius:6px;padding:6px 10px;font-size:13px;">
-            <option value="">Standard (XTTS Default)</option>
+            <option value="" disabled selected>(keine Stimme gewaehlt)</option>
           </select>
           <button class="btn secondary" onclick="loadXTTSVoices()" style="padding:4px 10px;font-size:11px;">Laden</button>
         </div>
diff --git a/xtts/f5tts/bridge.py b/xtts/f5tts/bridge.py
index 769932d..3330cb3 100644
--- a/xtts/f5tts/bridge.py
+++ b/xtts/f5tts/bridge.py
@@ -66,6 +66,11 @@ VOICES_DIR = Path(os.getenv("VOICES_DIR", "/voices"))
 PCM_CHUNK_BYTES = 8192   # ~170ms @ 24kHz mono s16
 TARGET_SR = 24000        # F5-TTS native
 
+# Placeholder text that older uploads stored as the reference .txt when the
+# transcription failed (e.g. whisper-bridge not yet online). It is treated as an
+# invalid reference: the .txt is deleted and the real text is transcribed again.
+_LEGACY_PLACEHOLDER_REF = "Das ist ein Referenz Audio."
+
 # ── Lazy F5-TTS Loader ──────────────────────────────────────
 
 _F5TTS_cls = None
@@ -279,13 +284,24 @@ async def _do_tts(ws, runner: F5Runner, text: str, voice: str,
                   request_id: str, message_id: str, language: str) -> None:
     t0 = time.time()
     ref_wav_path, ref_txt_path = voice_paths(voice) if voice else (None, None)
+
+    # Legacy placeholder or empty ref text → drop the .txt so it is re-transcribed below
+    if voice and ref_txt_path and ref_txt_path.exists():
+        try:
+            existing = ref_txt_path.read_text(encoding="utf-8").strip()
+            if existing == _LEGACY_PLACEHOLDER_REF or not existing:
+                logger.info("Voice '%s' hat Legacy-Platzhalter → loesche, transkribiere neu", voice)
+                ref_txt_path.unlink()
+        except Exception as exc:  # best effort: keep rendering, but leave a trace in the log
+            logger.warning("Voice '%s': Referenz-txt nicht pruefbar: %s", voice, exc)
+
     has_custom = bool(voice and ref_wav_path and ref_wav_path.exists() and ref_txt_path.exists())
     if voice and not has_custom:
         # Wenn nur WAV da ist aber kein txt → on-the-fly transkribieren
         if ref_wav_path and ref_wav_path.exists() and (not ref_txt_path or not ref_txt_path.exists()):
             logger.info("Voice '%s' hat kein txt — transkribiere on-the-fly", voice)
             text_ref = await request_transcription(ws, ref_wav_path, language)
-            if text_ref:
+            if text_ref and text_ref.strip():
                 try:
                     ref_txt_path.write_text(text_ref.strip(), encoding="utf-8")
                     has_custom = True
@@ -417,14 +433,20 @@ async def handle_voice_upload(ws, payload: dict) -> None:
         # Transkription ueber whisper-bridge anfragen
         logger.info("Transkribiere '%s' via whisper-bridge...", name)
         text = await request_transcription(ws, wav_path, language="de")
-        if not text:
-            logger.warning("Transkription fehlgeschlagen — speichere Platzhalter-Text")
-            text = "Das ist ein Referenz Audio."
-        txt_path.write_text(text.strip(), encoding="utf-8")
-        logger.info("Voice '%s' komplett (txt: %s)", name, text[:80])
+        ref_text_for_response = (text or "").strip()
+        if ref_text_for_response:
+            txt_path.write_text(ref_text_for_response, encoding="utf-8")
+            logger.info("Voice '%s' komplett (txt: %s)", name, ref_text_for_response[:80])
+        else:
+            # Never write a placeholder: with no .txt present, the first TTS render
+            # re-transcribes on the fly. Remove any .txt left over under this name
+            # so an outdated reference text cannot be paired with the new WAV.
+            txt_path.unlink(missing_ok=True)
+            logger.warning("Voice '%s': Transkription fehlgeschlagen — .txt bleibt leer, "
+                           "wird on-the-fly bei erstem Render nachgezogen", name)
 
         await _send(ws, "xtts_voice_saved", {
-            "name": name, "size": int(size_kb * 1024), "refText": text.strip(),
+            "name": name, "size": int(size_kb * 1024), "refText": ref_text_for_response,
         })
         # Liste aktualisieren
         await handle_list_voices(ws)