# Compare commits

5 commits
| Author | SHA1 | Date |
|---|---|---|
| | 8761d1a1b7 | |
| | abc5b971f4 | |
| | b588dd7e3b | |
| | 309df9d851 | |
| | f2e643d1fb | |
```diff
@@ -406,10 +406,11 @@ with ONNX Runtime: no API key, no cloud round-trip, no license fees,
 and the audio never leaves the device.
 
 **Bundled wake words** (ONNX files in `android/android/app/src/main/assets/openwakeword/`):
-- `Hey Jarvis` (default)
-- `Alexa`
-- `Hey Mycroft`
-- `Hey Rhasspy`
+- `Hey Jarvis` (default, openWakeWord original)
+- `Computer` (Star Trek style, community model)
+- `Alexa`, `Hey Mycroft`, `Hey Rhasspy` (openWakeWord originals)
+
+Community models come from [fwartner/home-assistant-wakewords-collection](https://github.com/fwartner/home-assistant-wakewords-collection).
 
 **Usage:**
 - App → **Settings** → **Wake Word** → select the desired keyword → **Save + Activate**
```
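For context, the keyword IDs the app stores (see the TypeScript constants further down in this diff) map onto files in that asset directory. A minimal lookup sketch, assuming the files are named `<keyword>.onnx`; the naming scheme and the helper itself are illustrative, not confirmed by the diff:

```kotlin
// Sketch only: keyword IDs come from the app's WAKE_KEYWORDS list below;
// the <keyword>.onnx file naming is an assumption.
fun wakeWordAssetPath(keyword: String): String {
    val bundled = setOf("hey_jarvis", "computer", "alexa", "hey_mycroft", "hey_rhasspy")
    require(keyword in bundled) { "Unknown wake word: $keyword" }
    return "openwakeword/$keyword.onnx" // relative to the assets/ root
}
```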
```diff
@@ -79,8 +79,8 @@ android {
         applicationId "com.ariacockpit"
         minSdkVersion rootProject.ext.minSdkVersion
         targetSdkVersion rootProject.ext.targetSdkVersion
-        versionCode 607
-        versionName "0.0.6.7"
+        versionCode 609
+        versionName "0.0.6.9"
         // Fallback for libraries with product flavors
         missingDimensionStrategy 'react-native-camera', 'general'
     }
```
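Both version fields move in lockstep here. Google Play requires `versionCode` to increase strictly with every release, and the scheme appears to derive it from the version name (0.0.6.9 → 609).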
Binary file not shown.
```diff
@@ -42,8 +42,8 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
         private const val MEL_FRAMES_PER_EMBEDDING = 76 // embedding window
         private const val EMBEDDING_STRIDE = 8 // slide by 8 mel frames
         private const val EMBEDDING_DIM = 96
-        private const val WW_INPUT_FRAMES = 16 // 16 embeddings = ~1.28s
         private const val MEL_BINS = 32
+        private const val DEFAULT_WW_INPUT_FRAMES = 16 // fallback if model metadata is missing
     }
 
     private val env: OrtEnvironment = OrtEnvironment.getEnvironment()
```
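The removed comment's "~1.28s" follows from these constants if one assumes openWakeWord's usual 10 ms mel hop (an assumption; the hop size is not stated in this diff): one embedding step advances `EMBEDDING_STRIDE` = 8 mel frames ≈ 80 ms, so 16 stacked embeddings span 16 × 80 ms = 1.28 s, while the 76-frame embedding window itself covers about 760 ms of audio.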
```diff
@@ -54,6 +54,10 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
     private var melInputName: String = "input"
     private var embInputName: String = "input_1"
     private var wwInputName: String = "input"
+    // Number of embedding frames the wake-word classifier expects per inference;
+    // hey_jarvis uses 16, other community models may differ (e.g. 28).
+    // Read from the model metadata in init().
+    private var wwInputFrames: Int = DEFAULT_WW_INPUT_FRAMES
 
     // Configuration
     private var threshold: Float = 0.5f
```
```diff
@@ -100,7 +104,13 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
             embInputName = embSession!!.inputNames.first()
             wwInputName = wwSession!!.inputNames.first()
 
-            Log.i(TAG, "Init OK: model=$modelName threshold=$threshold patience=$patience " +
+            // Read the WW input frame count from the model; it varies per keyword.
+            // Expected shape: (1, N, 96); N comes from the model metadata.
+            val wwInputInfo = wwSession!!.inputInfo[wwInputName]
+            val wwShape = (wwInputInfo?.info as? ai.onnxruntime.TensorInfo)?.shape
+            wwInputFrames = wwShape?.getOrNull(1)?.toInt()?.takeIf { it > 0 } ?: DEFAULT_WW_INPUT_FRAMES
+
+            Log.i(TAG, "Init OK: model=$modelName wwFrames=$wwInputFrames threshold=$threshold patience=$patience " +
                 "debounce=${debounceMs}ms (inputs: mel=$melInputName emb=$embInputName ww=$wwInputName)")
             promise.resolve(true)
         } catch (e: Exception) {
```
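The same metadata probe works as a standalone check. A self-contained sketch using the ONNX Runtime Java API as the diff does; the model path and fallback are placeholders:

```kotlin
import ai.onnxruntime.OrtEnvironment
import ai.onnxruntime.TensorInfo

// Returns the classifier's expected embedding-frame count N from an input
// shape of (1, N, 96), falling back when the dimension is dynamic (-1)
// or the metadata is missing.
fun probeWakeWordFrames(modelPath: String, fallback: Int = 16): Int {
    val env = OrtEnvironment.getEnvironment()
    env.createSession(modelPath).use { session ->
        val inputName = session.inputNames.first()
        val shape = (session.inputInfo[inputName]?.info as? TensorInfo)?.shape
        return shape?.getOrNull(1)?.toInt()?.takeIf { it > 0 } ?: fallback
    }
}
```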
```diff
@@ -299,11 +309,12 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
         val embRes = embSession!!.run(mapOf(embInputName to embIn))
         val embOut = embRes.get(0).value
         embIn.close()
-        // Expected output shape: (1, 96) → Array<FloatArray>
+        // Expected output shape: (1, 1, 1, 96), i.e. rank 4, NOT (1, 96).
+        // The Google embedding pipeline keeps the extra dimensions.
         @Suppress("UNCHECKED_CAST")
-        val embArr = embOut as Array<FloatArray>
-        embBuffer.addLast(embArr[0].copyOf())
-        while (embBuffer.size > WW_INPUT_FRAMES) embBuffer.removeFirst()
+        val embArr = embOut as Array<Array<Array<FloatArray>>>
+        embBuffer.addLast(embArr[0][0][0].copyOf())
+        while (embBuffer.size > wwInputFrames) embBuffer.removeFirst()
         embRes.close()
 
         melProcessedIdx += EMBEDDING_STRIDE
```
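Since the unchecked cast is the fragile part, a defensive variant could accept either layout. A sketch, not part of the diff, that assumes the output is rank 2 or rank 4:

```kotlin
// Accept either a flat (1, 96) output or the rank-4 (1, 1, 1, 96) layout
// seen above, so a model exported with squeezed dimensions does not crash.
fun extractEmbedding(raw: Any): FloatArray = when (raw) {
    is Array<*> -> when (val first = raw[0]) {
        is FloatArray -> first.copyOf() // (1, 96)
        is Array<*> -> {
            @Suppress("UNCHECKED_CAST")
            (raw as Array<Array<Array<FloatArray>>>)[0][0][0].copyOf() // (1, 1, 1, 96)
        }
        else -> error("Unexpected embedding element type: $first")
    }
    else -> error("Unexpected embedding output type: $raw")
}
```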
```diff
@@ -319,9 +330,10 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
         }
 
         // 3) Classification, as soon as we have 16 embeddings
-        if (embBuffer.size < WW_INPUT_FRAMES) return
-        val flatEmb = FloatArray(WW_INPUT_FRAMES * EMBEDDING_DIM)
+        if (embBuffer.size < wwInputFrames) return
+        val flatEmb = FloatArray(wwInputFrames * EMBEDDING_DIM)
         var p = 0
+        // Take the last wwInputFrames embeddings (embBuffer is capped at wwInputFrames)
         for (e in embBuffer) {
             System.arraycopy(e, 0, flatEmb, p, EMBEDDING_DIM)
             p += EMBEDDING_DIM
```
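Iterating `embBuffer` oldest-to-newest fills `flatEmb` as one contiguous row-major block of `wwInputFrames` × 96 floats, which is exactly the (1, wwInputFrames, 96) layout handed to `OnnxTensor.createTensor` in the next hunk; because the buffer is trimmed to `wwInputFrames` right after each append, no extra slicing is needed here.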
```diff
@@ -329,7 +341,7 @@ class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBa
         val wwIn = OnnxTensor.createTensor(
             env,
             FloatBuffer.wrap(flatEmb),
-            longArrayOf(1L, WW_INPUT_FRAMES.toLong(), EMBEDDING_DIM.toLong()),
+            longArrayOf(1L, wwInputFrames.toLong(), EMBEDDING_DIM.toLong()),
         )
         val wwRes = wwSession!!.run(mapOf(wwInputName to wwIn))
         val wwOut = wwRes.get(0).value
```
```diff
@@ -137,6 +137,17 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
                 Log.w(TAG, "play() failed immediately: ${e.message}")
             }
         }
+        // Idle cutoff: if endRequested never arrived but nothing has come in
+        // for 30s, we abort (bridge crash, lost final chunk).
+        var idleMs = 0L
+        val maxIdleMs = 30_000L
+        // Target buffer level: below this watermark we feed in silence so the
+        // AudioTrack does not underrun while the bridge renders the next
+        // sentence. Otherwise Spotify/YouTube react with an unprompted
+        // resume after ~10s of silence.
+        val underrunGuardFrames = sampleRate / 10 // ~100ms
+        val silenceFillFrames = sampleRate / 20 // ~50ms per refill
+
         mainLoop@ while (!writerShouldStop) {
             val data = queue.poll(50, java.util.concurrent.TimeUnit.MILLISECONDS)
             if (data == null) {
```
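To put numbers on the watermark, assuming a 24 kHz stream (the actual `sampleRate` is set elsewhere): `underrunGuardFrames` = 24000 / 10 = 2400 frames ≈ 100 ms and `silenceFillFrames` = 1200 frames ≈ 50 ms. Since `queue.poll` times out every 50 ms, the guard refills at most 50 ms of silence per 50 ms of idle time, i.e. silence is injected at no more than real-time rate.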
```diff
@@ -153,8 +164,33 @@ class PcmStreamPlayerModule(reactContext: ReactApplicationContext) : ReactContex
                     }
                     break@mainLoop
                 }
+                // Underrun guard: feed in silence when the AudioTrack buffer
+                // is about to run dry. Otherwise Spotify resumes on its own
+                // after ~10s of pause, even though we hold the focus.
+                if (playbackStarted) {
+                    val framesWritten = bytesBuffered / streamBytesPerFrame
+                    val framesPlayed = t.playbackHeadPosition.toLong()
+                    val framesInBuffer = framesWritten - framesPlayed
+                    if (framesInBuffer < underrunGuardFrames) {
+                        val fillBytes = silenceFillFrames * streamBytesPerFrame
+                        val silence = ByteArray(fillBytes)
+                        var silOff = 0
+                        while (silOff < silence.size && !writerShouldStop) {
+                            val w = t.write(silence, silOff, silence.size - silOff)
+                            if (w <= 0) break
+                            silOff += w
+                        }
+                        bytesBuffered += silence.size
+                    }
+                }
+                idleMs += 50L
+                if (idleMs >= maxIdleMs) {
+                    Log.w(TAG, "Idle cutoff: no data for ${maxIdleMs}ms, ending stream")
+                    break@mainLoop
+                }
                 continue@mainLoop
             }
+            idleMs = 0L
 
             // Pre-roll check: call play() only once enough is buffered
             if (!playbackStarted && bytesBuffered + data.size >= prerollBytes) {
```
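One caveat the guard inherits from the platform: `AudioTrack.getPlaybackHeadPosition()` returns a signed 32-bit frame counter that, per the Android documentation, should be reinterpreted as unsigned because it wraps on long-lived tracks. A wrap-safe read, as a sketch:

```kotlin
import android.media.AudioTrack

// Mask to reinterpret the signed frame counter as unsigned, so the
// framesWritten - framesPlayed arithmetic stays correct past the signed
// overflow (roughly 12 hours at 48 kHz; the mask doubles that range).
fun playedFrames(track: AudioTrack): Long =
    track.playbackHeadPosition.toLong() and 0xFFFF_FFFFL
```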
```diff
@@ -1,6 +1,6 @@
 {
   "name": "aria-cockpit",
-  "version": "0.0.6.7",
+  "version": "0.0.6.9",
   "private": true,
   "scripts": {
     "android": "react-native run-android",
```
```diff
@@ -619,6 +619,8 @@ const ChatScreen: React.FC = () => {
         base64: result.base64,
         durationMs: result.durationMs,
         mimeType: result.mimeType,
+        voice: localXttsVoiceRef.current,
+        speed: ttsSpeedRef.current,
         ...(location && { location }),
       });
     }, [getCurrentLocation]);
```
```diff
@@ -36,6 +36,7 @@ export const WAKE_KEYWORD_STORAGE = 'aria_wake_keyword';
  * Diagnostic upload is Phase 2. */
 export const WAKE_KEYWORDS = [
   'hey_jarvis',
+  'computer',
   'alexa',
   'hey_mycroft',
   'hey_rhasspy',
@@ -46,6 +47,7 @@ export const DEFAULT_KEYWORD: WakeKeyword = 'hey_jarvis';
 /** Helper mapping for display in the UI. */
 export const KEYWORD_LABELS: Record<WakeKeyword, string> = {
   hey_jarvis: 'Hey Jarvis',
+  computer: 'Computer',
   alexa: 'Alexa',
   hey_mycroft: 'Hey Mycroft',
   hey_rhasspy: 'Hey Rhasspy',
```
```diff
@@ -907,18 +907,13 @@ class ARIABridge:
             logger.info("[core] TTS suppressed (mode: %s)", self.current_mode.config.name)
             return
 
-        # Determine voice: app override for this request > global default voice
+        # Determine voice: app override (set by the last chat event) > global
+        # default voice. The override is NOT consumed per response; otherwise a
+        # multi-turn answer from ARIA (tool use + final answer) would fall back
+        # to the old default voice from the second TTS call on. The override
+        # stays valid until the next chat event, which replaces or clears it.
         xtts_voice = self._next_voice_override or getattr(self, 'xtts_voice', '')
-        # Consume the override (applies only to exactly this next response)
-        if self._next_voice_override:
-            logger.info("[core] Using voice override: %s", self._next_voice_override)
-            self._next_voice_override = None
-
-        # Take speed from the app override as well (fallback 1.0)
         xtts_speed = self._next_speed_override or 1.0
-        if self._next_speed_override:
-            logger.info("[core] Using speed override: %.2fx", self._next_speed_override)
-            self._next_speed_override = None
 
         tts_text = tts_text_preview or text
         if not tts_text:
```
```diff
@@ -1169,18 +1164,22 @@ class ARIABridge:
         if sender in ("aria", "stt"):
             return
         text = payload.get("text", "")
-        # Remember the voice override for the next ARIA response
-        voice_override = payload.get("voice", "")
-        if voice_override:
-            self._next_voice_override = voice_override
-            logger.info("[rvs] Voice override for next response: %s", voice_override)
+        # Set the voice override for subsequent messages; valid until the next
+        # chat event. Empty string "" = explicitly the default voice (clear the
+        # override). Field not sent = leave the previous override untouched
+        # (e.g. when cancel_request or another service bypasses the app).
+        if "voice" in payload:
+            voice_override = payload.get("voice", "") or ""
+            self._next_voice_override = voice_override or None
+            logger.info("[rvs] Voice for responses: %s",
+                        self._next_voice_override or "(default)")
         # Speed override (TTS playback speed, per device)
-        try:
-            speed = float(payload.get("speed", 0) or 0)
-            if 0.1 <= speed <= 5.0:
-                self._next_speed_override = speed
-        except (TypeError, ValueError):
-            pass
+        if "speed" in payload:
+            try:
+                speed = float(payload.get("speed", 0) or 0)
+                self._next_speed_override = speed if 0.1 <= speed <= 5.0 else None
+            except (TypeError, ValueError):
+                self._next_speed_override = None
         if text:
             logger.info("[rvs] App chat: '%s'", text[:80])
             await self.send_to_core(text, source="app")
```
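The payload semantics here are tri-state: field absent keeps the previous override, an empty value clears it, and anything else sets it. A minimal sketch of that rule, written in Kotlin for consistency with the Android modules above; the names are illustrative, not from the bridge:

```kotlin
// Tri-state override update, mirroring the "voice" handling above:
// key absent  -> keep the previous override
// key = ""    -> clear the override (explicit default)
// key = value -> set the override
fun updateOverride(payload: Map<String, String>, key: String, current: String?): String? =
    if (key !in payload) current
    else payload[key].takeUnless { it.isNullOrEmpty() }

// e.g. updateOverride(mapOf("voice" to ""), "voice", "anna") == null  ("anna" is illustrative)
```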
```diff
@@ -1444,17 +1443,18 @@ class ARIABridge:
         if not audio_b64:
             logger.warning("[rvs] Audio received without data")
             return
-        # Voice override for the upcoming ARIA response (chosen locally in the app)
-        voice_override = payload.get("voice", "")
-        if voice_override:
-            self._next_voice_override = voice_override
-            logger.info("[rvs] Voice override (via audio): %s", voice_override)
-        try:
-            speed = float(payload.get("speed", 0) or 0)
-            if 0.1 <= speed <= 5.0:
-                self._next_speed_override = speed
-        except (TypeError, ValueError):
-            pass
+        # Voice override for subsequent messages; same semantics as the chat event.
+        if "voice" in payload:
+            voice_override = payload.get("voice", "") or ""
+            self._next_voice_override = voice_override or None
+            logger.info("[rvs] Voice for responses (via audio): %s",
+                        self._next_voice_override or "(default)")
+        if "speed" in payload:
+            try:
+                speed = float(payload.get("speed", 0) or 0)
+                self._next_speed_override = speed if 0.1 <= speed <= 5.0 else None
+            except (TypeError, ValueError):
+                self._next_speed_override = None
         logger.info("[rvs] Audio received: %s, %dms, %dKB",
                     mime_type, duration_ms, len(audio_b64) // 1365)
         asyncio.create_task(self._process_app_audio(audio_b64, mime_type))
```
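A note on the `// 1365` divisor in the log line: base64 encodes 3 payload bytes into 4 characters, so the decoded audio is about `len(audio_b64) * 3 / 4` bytes; dividing by 1024 to get KiB yields `len(audio_b64) / 1365.33`, which is presumably where the integer 1365 comes from.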