feat(app): Wake-Word komplett on-device via openWakeWord (ONNX)

Picovoice/Porcupine raus — neuer Stack ist openWakeWord (Apache 2.0, on-device, ONNX Runtime). Kein API-Key, keine Lizenzgebuehren, Audio verlaesst das Geraet nicht. Eigene Wake-Words sind via openWakeWord-Notebook gratis trainierbar. Pipeline (alles im OpenWakeWordModule.kt): 1. AudioRecord 16kHz mono int16 in 1280-Sample-Chunks (80ms) 2. melspectrogram.onnx → 32-mel Frames (mel/10 + 2 wie in Python) 3. embedding_model.onnx, 76-Frame Sliding Window (stride 8) → 96-dim 4. hey_jarvis.onnx (oder anderes Keyword) auf letzten 16 Embeddings 5. Sigmoid-Score, threshold/patience/debounce-Filter 6. RN-Event "WakeWordDetected" raus Mitgelieferte Modelle in assets/openwakeword/: hey_jarvis (Default), alexa, hey_mycroft, hey_rhasspy. Externe Service-API (start/stop/configure/onWakeWord/...) bleibt identisch — ChatScreen unveraendert. build.gradle: com.microsoft.onnxruntime:onnxruntime-android:1.17.1 package.json: @picovoice/porcupine-react-native + voice-processor raus SettingsScreen: AccessKey-Feld weg, neue Keyword-Liste mit Labels README: Wake-Word-Sektion komplett umgeschrieben (kein Picovoice mehr) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 12:56:33 +02:00
parent a4d3449e3a
commit 55cfb752a2
14 changed files with 532 additions and 196 deletions
@@ -21,6 +21,7 @@ class MainApplication : Application(), ReactApplication {
              add(ApkInstallerPackage())
              add(AudioFocusPackage())
              add(PcmStreamPlayerPackage())
+              add(OpenWakeWordPackage())
            }

        override fun getJSMainModuleName(): String = "index"
@@ -0,0 +1,357 @@
+package com.ariacockpit
+
+import ai.onnxruntime.OnnxTensor
+import ai.onnxruntime.OrtEnvironment
+import ai.onnxruntime.OrtSession
+import android.Manifest
+import android.content.pm.PackageManager
+import android.media.AudioFormat
+import android.media.AudioRecord
+import android.media.MediaRecorder
+import android.util.Log
+import androidx.core.content.ContextCompat
+import com.facebook.react.bridge.Promise
+import com.facebook.react.bridge.ReactApplicationContext
+import com.facebook.react.bridge.ReactContextBaseJavaModule
+import com.facebook.react.bridge.ReactMethod
+import com.facebook.react.modules.core.DeviceEventManagerModule
+import java.nio.FloatBuffer
+import java.util.concurrent.atomic.AtomicBoolean
+
+/**
+ * On-device wake-word detection via openWakeWord (https://github.com/dscripka/openWakeWord).
+ *
+ * Three-stage ONNX pipeline:
+ *   1. Audio (16 kHz mono int16, 1280-sample chunks) → melspectrogram → 32-bin mel frames
+ *   2. Sliding window of 76 mel frames (stride 8) → speech embedding → 96-dim vector
+ *   3. Latest 16 embeddings (~1.28 s of context) → wake-word classifier → score
+ *
+ * Models are read from assets/openwakeword/ (the mel and embedding models are shared, plus
+ * one .onnx per keyword). A detection fires after `patience` consecutive scores at or above
+ * `threshold`; further detections are suppressed for `debounceMs`.
+ *
+ * Emits the RN device event "WakeWordDetected" (payload key `model`) on every detection.
+ *
+ * Threading: the @ReactMethod entry points and the capture thread share the ONNX sessions,
+ * the configuration and the inference buffers. All of that state is guarded by [stateLock].
+ */
+class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
+    override fun getName() = "OpenWakeWord"
+
+    companion object {
+        private const val TAG = "OpenWakeWord"
+        private const val SAMPLE_RATE = 16000
+        private const val CHUNK_SAMPLES = 1280               // 80 ms @ 16 kHz
+        private const val MEL_FRAMES_PER_EMBEDDING = 76      // embedding window
+        private const val EMBEDDING_STRIDE = 8               // slide by 8 mel frames
+        private const val EMBEDDING_DIM = 96
+        private const val WW_INPUT_FRAMES = 16               // 16 embeddings = ~1.28 s
+        private const val MEL_BINS = 32
+
+        // Trailing samples of the previous audio that are prepended to every chunk so the
+        // mel frames stay contiguous across chunk boundaries. openWakeWord's streaming
+        // code uses the same 160 * 3 samples of look-back.
+        private const val MEL_CONTEXT_SAMPLES = 160 * 3
+    }
+
+    private val env: OrtEnvironment = OrtEnvironment.getEnvironment()
+
+    // Guards sessions, input names, configuration and all inference buffers below.
+    private val stateLock = Any()
+
+    private var melSession: OrtSession? = null
+    private var embSession: OrtSession? = null
+    private var wwSession: OrtSession? = null
+
+    private var melInputName: String = "input"
+    private var embInputName: String = "input_1"
+    private var wwInputName: String = "input"
+
+    // Configuration
+    private var threshold: Float = 0.5f
+    private var patience: Int = 2
+    private var debounceMs: Long = 1500
+    private var modelName: String = "hey_jarvis"
+
+    // Audio capture thread
+    private var audioRecord: AudioRecord? = null
+    private val running = AtomicBoolean(false)
+    private var captureThread: Thread? = null
+
+    // Inference state
+    private var audioTail: FloatArray = FloatArray(0)               // last MEL_CONTEXT_SAMPLES samples
+    private val melBuffer: ArrayList<FloatArray> = ArrayList(256)   // pending 32-dim mel frames
+    private val embBuffer: ArrayDeque<FloatArray> = ArrayDeque(32)  // most recent embeddings
+    private var consecutiveAboveThreshold: Int = 0
+    private var lastDetectionMs: Long = 0L
+
+    /**
+     * Loads the ONNX sessions for one wake word and stores the detection parameters.
+     *
+     * The new sessions are created first and then swapped in under [stateLock], so a
+     * running capture thread never sees a closed session. On failure all sessions are
+     * disposed and the promise is rejected with "INIT_FAILED".
+     *
+     * @param modelName asset file name without suffix (e.g. "hey_jarvis", "alexa")
+     * @param threshold minimum classifier score that counts as a hit
+     * @param patience number of consecutive hits required (values below 1 are raised to 1)
+     * @param debounceMs minimum time between two emitted detections, in milliseconds
+     * @param promise resolved with `true` on success
+     */
+    @ReactMethod
+    fun init(modelName: String, threshold: Double, patience: Int, debounceMs: Int, promise: Promise) {
+        val loaded = ArrayList<OrtSession>(3)
+        try {
+            val assets = reactApplicationContext.assets
+            val melBytes = assets.open("openwakeword/melspectrogram.onnx").use { it.readBytes() }
+            val embBytes = assets.open("openwakeword/embedding_model.onnx").use { it.readBytes() }
+            val wwBytes = assets.open("openwakeword/$modelName.onnx").use { it.readBytes() }
+
+            // SessionOptions owns a native handle; it is only needed while creating sessions.
+            val opts = OrtSession.SessionOptions()
+            try {
+                loaded.add(env.createSession(melBytes, opts))
+                loaded.add(env.createSession(embBytes, opts))
+                loaded.add(env.createSession(wwBytes, opts))
+            } finally {
+                opts.close()
+            }
+            val inputNames = loaded.map { it.inputNames.first() }
+
+            synchronized(stateLock) {
+                disposeSessions()
+                this.modelName = modelName
+                this.threshold = threshold.toFloat()
+                this.patience = patience.coerceAtLeast(1)
+                this.debounceMs = debounceMs.toLong()
+                melSession = loaded[0]
+                embSession = loaded[1]
+                wwSession = loaded[2]
+                melInputName = inputNames[0]
+                embInputName = inputNames[1]
+                wwInputName = inputNames[2]
+                // Hits counted for the previous keyword must not carry over to the new one.
+                consecutiveAboveThreshold = 0
+            }
+
+            Log.i(TAG, "Init OK: model=$modelName threshold=$threshold patience=$patience " +
+                    "debounce=${debounceMs}ms (inputs: mel=$melInputName emb=$embInputName ww=$wwInputName)")
+            promise.resolve(true)
+        } catch (e: Exception) {
+            Log.e(TAG, "Init fehlgeschlagen: ${e.message}", e)
+            // Best effort: a session may already be closed if it was installed before the failure.
+            for (session in loaded) {
+                try { session.close() } catch (_: Exception) {}
+            }
+            synchronized(stateLock) { disposeSessions() }
+            promise.reject("INIT_FAILED", e.message ?: "Unbekannter Fehler", e)
+        }
+    }
+
+    /**
+     * Opens the microphone and starts the capture thread.
+     *
+     * Resolves immediately when already running. Rejects with "NOT_INITIALIZED" when
+     * [init] has not succeeded, "NO_MIC_PERMISSION" when RECORD_AUDIO is not granted,
+     * "AUDIO_INIT" when the recorder cannot be initialized and "START_FAILED" otherwise.
+     */
+    @ReactMethod
+    fun start(promise: Promise) {
+        if (running.get()) {
+            promise.resolve(true)
+            return
+        }
+        val initialized = synchronized(stateLock) {
+            melSession != null && embSession != null && wwSession != null
+        }
+        if (!initialized) {
+            promise.reject("NOT_INITIALIZED", "init() muss vor start() aufgerufen werden")
+            return
+        }
+        // The app normally requests the permission beforehand, but check explicitly so
+        // AudioRecord does not fail silently.
+        val perm = ContextCompat.checkSelfPermission(reactApplicationContext, Manifest.permission.RECORD_AUDIO)
+        if (perm != PackageManager.PERMISSION_GRANTED) {
+            promise.reject("NO_MIC_PERMISSION", "RECORD_AUDIO Permission fehlt")
+            return
+        }
+
+        try {
+            // A capture loop that ended on a read error leaves its recorder behind;
+            // release it before the field is overwritten.
+            releaseRecorder()
+
+            val minBuf = AudioRecord.getMinBufferSize(
+                SAMPLE_RATE,
+                AudioFormat.CHANNEL_IN_MONO,
+                AudioFormat.ENCODING_PCM_16BIT,
+            ).coerceAtLeast(CHUNK_SAMPLES * 2 * 4)
+
+            val record = AudioRecord(
+                MediaRecorder.AudioSource.MIC,
+                SAMPLE_RATE,
+                AudioFormat.CHANNEL_IN_MONO,
+                AudioFormat.ENCODING_PCM_16BIT,
+                minBuf,
+            )
+            if (record.state != AudioRecord.STATE_INITIALIZED) {
+                record.release()
+                promise.reject("AUDIO_INIT", "AudioRecord nicht initialisiert (Mikro belegt?)")
+                return
+            }
+            audioRecord = record
+            synchronized(stateLock) { resetInferenceState() }
+            running.set(true)
+            record.startRecording()
+
+            // The thread gets its own reference so it never reads a recorder that a later
+            // start() call installed.
+            captureThread = Thread({ captureLoop(record) }, "OpenWakeWordCapture").apply {
+                isDaemon = true
+                start()
+            }
+
+            Log.i(TAG, "Lauschen gestartet (model=$modelName)")
+            promise.resolve(true)
+        } catch (e: Exception) {
+            Log.e(TAG, "start fehlgeschlagen", e)
+            running.set(false)
+            releaseRecorder()
+            promise.reject("START_FAILED", e.message ?: "Unbekannter Fehler", e)
+        }
+    }
+
+    /** Stops the capture thread and releases the microphone; the sessions stay loaded. */
+    @ReactMethod
+    fun stop(promise: Promise) {
+        stopCapture(1500)
+        Log.i(TAG, "Lauschen gestoppt")
+        promise.resolve(true)
+    }
+
+    /** Stops capturing and closes all ONNX sessions; [init] is required before the next start. */
+    @ReactMethod
+    fun dispose(promise: Promise) {
+        stopCapture(1000)
+        synchronized(stateLock) { disposeSessions() }
+        promise.resolve(true)
+    }
+
+    @ReactMethod
+    fun isAvailable(promise: Promise) {
+        // Wake word is always available (no API key, everything runs on-device).
+        promise.resolve(true)
+    }
+
+    // RN event subscription stubs — RN convention, otherwise a warning in debug builds.
+    @ReactMethod fun addListener(eventName: String) {}
+    @ReactMethod fun removeListeners(count: Int) {}
+
+    /** Releases microphone, thread and sessions when the React instance is torn down. */
+    override fun invalidate() {
+        stopCapture(1000)
+        synchronized(stateLock) { disposeSessions() }
+        super.invalidate()
+    }
+
+    /**
+     * Signals the capture loop to end, waits up to [joinTimeoutMs] for it and then
+     * releases the recorder.
+     */
+    private fun stopCapture(joinTimeoutMs: Long) {
+        running.set(false)
+        try {
+            captureThread?.join(joinTimeoutMs)
+        } catch (_: InterruptedException) {
+            // Keep the interrupt visible to the caller's thread instead of dropping it.
+            Thread.currentThread().interrupt()
+        }
+        captureThread = null
+        releaseRecorder()
+    }
+
+    /** Stops and releases the current recorder, if any. Failures are ignored (best effort). */
+    private fun releaseRecorder() {
+        val record = audioRecord ?: return
+        audioRecord = null
+        try { record.stop() } catch (_: Exception) {}
+        try { record.release() } catch (_: Exception) {}
+    }
+
+    /** Closes all sessions. Must be called while holding [stateLock]. */
+    private fun disposeSessions() {
+        try { melSession?.close() } catch (_: Exception) {}
+        try { embSession?.close() } catch (_: Exception) {}
+        try { wwSession?.close() } catch (_: Exception) {}
+        melSession = null
+        embSession = null
+        wwSession = null
+    }
+
+    /** Clears all buffers and counters. Must be called while holding [stateLock]. */
+    private fun resetInferenceState() {
+        audioTail = FloatArray(0)
+        melBuffer.clear()
+        embBuffer.clear()
+        consecutiveAboveThreshold = 0
+        lastDetectionMs = 0L
+    }
+
+    /** Sends the "WakeWordDetected" event with the detected [model] to JS. */
+    private fun emitDetected(model: String) {
+        val params = com.facebook.react.bridge.Arguments.createMap().apply {
+            putString("model", model)
+        }
+        try {
+            reactApplicationContext
+                .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
+                .emit("WakeWordDetected", params)
+        } catch (e: Exception) {
+            Log.w(TAG, "emit fehlgeschlagen: ${e.message}")
+        }
+    }
+
+    /**
+     * Body of the capture thread: reads full 1280-sample chunks from [record] and feeds
+     * them to [processChunk] until [running] is cleared or a read fails.
+     */
+    private fun captureLoop(record: AudioRecord) {
+        val chunk = ShortArray(CHUNK_SAMPLES)
+        Log.i(TAG, "Capture-Loop gestartet")
+        while (running.get()) {
+            var filled = 0
+            while (filled < CHUNK_SAMPLES && running.get()) {
+                val n = record.read(chunk, filled, CHUNK_SAMPLES - filled)
+                if (n <= 0) {
+                    Log.w(TAG, "AudioRecord.read returned $n — Loop ende")
+                    running.set(false)
+                    return
+                }
+                filled += n
+            }
+            if (!running.get()) break
+            try {
+                processChunk(chunk)
+            } catch (e: Exception) {
+                // One failed chunk must not end the listening session.
+                Log.w(TAG, "processChunk: ${e.message}")
+            }
+        }
+        Log.i(TAG, "Capture-Loop beendet")
+    }
+
+    /** Processes one 1280-sample int16 audio chunk and emits an event on detection. */
+    private fun processChunk(audio: ShortArray) {
+        val detectedModel = synchronized(stateLock) { runPipeline(audio) }
+        // Emit outside the lock so the bridge call cannot block init()/dispose().
+        if (detectedModel != null) emitDetected(detectedModel)
+    }
+
+    /**
+     * Runs mel → embedding → classifier for one chunk. Must be called while holding
+     * [stateLock].
+     *
+     * @return the model name when a detection fired, otherwise `null`
+     */
+    private fun runPipeline(audio: ShortArray): String? {
+        val mel = melSession ?: return null
+        val emb = embSession ?: return null
+        val ww = wwSession ?: return null
+
+        appendMelFrames(mel, audio)
+        val produced = appendEmbeddings(emb)
+        // Re-scoring an unchanged embedding window would count the same score twice
+        // towards `patience`, so classify only when something new arrived.
+        if (produced == 0 || embBuffer.size < WW_INPUT_FRAMES) return null
+
+        val score = classify(ww)
+        return if (registerScore(score)) modelName else null
+    }
+
+    /**
+     * Stage 1: converts [audio] (plus the tail of the previous audio) to mel frames and
+     * appends them to [melBuffer], applying openWakeWord's `mel / 10 + 2` scaling.
+     */
+    private fun appendMelFrames(session: OrtSession, audio: ShortArray) {
+        val tail = audioTail
+        val samples = FloatArray(tail.size + audio.size)
+        System.arraycopy(tail, 0, samples, 0, tail.size)
+        // The mel model takes raw int16 magnitudes as float, not values normalized to [-1, 1].
+        for (i in audio.indices) samples[tail.size + i] = audio[i].toFloat()
+        audioTail = samples.copyOfRange(maxOf(0, samples.size - MEL_CONTEXT_SAMPLES), samples.size)
+
+        val mel = runFloatModel(session, melInputName, samples, longArrayOf(1L, samples.size.toLong()))
+        // Output is read flat (row-major), i.e. consecutive runs of MEL_BINS values per frame.
+        for (frame in 0 until mel.size / MEL_BINS) {
+            val base = frame * MEL_BINS
+            melBuffer.add(FloatArray(MEL_BINS) { mel[base + it] / 10f + 2f })
+        }
+    }
+
+    /**
+     * Stage 2: computes one embedding for every complete 76-frame window in [melBuffer]
+     * and advances the window by [EMBEDDING_STRIDE] frames each time.
+     *
+     * @return number of embeddings appended to [embBuffer]
+     */
+    private fun appendEmbeddings(session: OrtSession): Int {
+        var produced = 0
+        while (melBuffer.size >= MEL_FRAMES_PER_EMBEDDING) {
+            try {
+                val window = FloatArray(MEL_FRAMES_PER_EMBEDDING * MEL_BINS)
+                for (i in 0 until MEL_FRAMES_PER_EMBEDDING) {
+                    System.arraycopy(melBuffer[i], 0, window, i * MEL_BINS, MEL_BINS)
+                }
+                val embedding = runFloatModel(
+                    session,
+                    embInputName,
+                    window,
+                    longArrayOf(1L, MEL_FRAMES_PER_EMBEDDING.toLong(), MEL_BINS.toLong(), 1L),
+                )
+                check(embedding.size >= EMBEDDING_DIM) {
+                    "Embedding output has ${embedding.size} values, expected $EMBEDDING_DIM"
+                }
+                embBuffer.addLast(embedding.copyOf(EMBEDDING_DIM))
+                while (embBuffer.size > WW_INPUT_FRAMES) embBuffer.removeFirst()
+                produced++
+            } finally {
+                // Advance even when the model throws; otherwise the buffer would grow
+                // without bound while the same window is retried on every chunk.
+                melBuffer.subList(0, EMBEDDING_STRIDE).clear()
+            }
+        }
+        return produced
+    }
+
+    /** Stage 3: scores the 16 buffered embeddings with the wake-word classifier. */
+    private fun classify(session: OrtSession): Float {
+        val features = FloatArray(WW_INPUT_FRAMES * EMBEDDING_DIM)
+        var offset = 0
+        for (embedding in embBuffer) {
+            System.arraycopy(embedding, 0, features, offset, EMBEDDING_DIM)
+            offset += EMBEDDING_DIM
+        }
+        val out = runFloatModel(
+            session,
+            wwInputName,
+            features,
+            longArrayOf(1L, WW_INPUT_FRAMES.toLong(), EMBEDDING_DIM.toLong()),
+        )
+        check(out.isNotEmpty()) { "Wake-word model returned no score" }
+        return out[0]
+    }
+
+    /**
+     * Applies the threshold / patience / debounce filter to one classifier score.
+     *
+     * @return `true` when this score completes a detection that should be emitted
+     */
+    private fun registerScore(score: Float): Boolean {
+        if (score >= threshold) {
+            consecutiveAboveThreshold++
+            if (consecutiveAboveThreshold >= patience) {
+                val now = System.currentTimeMillis()
+                if (now - lastDetectionMs >= debounceMs) {
+                    lastDetectionMs = now
+                    consecutiveAboveThreshold = 0
+                    Log.i(TAG, "Wake-Word erkannt! score=$score model=$modelName")
+                    return true
+                }
+            }
+        } else {
+            consecutiveAboveThreshold = 0
+        }
+        return false
+    }
+
+    /**
+     * Runs [session] with a single float input and returns its first output flattened in
+     * row-major order. Reading the output flat avoids depending on the exact output rank.
+     * The input tensor and the result are always closed, also when the run throws.
+     */
+    private fun runFloatModel(
+        session: OrtSession,
+        inputName: String,
+        data: FloatArray,
+        shape: LongArray,
+    ): FloatArray =
+        OnnxTensor.createTensor(env, FloatBuffer.wrap(data), shape).use { input ->
+            session.run(mapOf(inputName to input)).use { result ->
+                val tensor = result.get(0) as? OnnxTensor
+                    ?: error("Model output 0 is not a tensor")
+                val values = tensor.floatBuffer
+                    ?: error("Model output 0 is not a float tensor")
+                FloatArray(values.remaining()).also { values.get(it) }
+            }
+        }
+}
@@ -0,0 +1,16 @@
+package com.ariacockpit
+
+import com.facebook.react.ReactPackage
+import com.facebook.react.bridge.NativeModule
+import com.facebook.react.bridge.ReactApplicationContext
+import com.facebook.react.uimanager.ViewManager
+
+/** React Native package that registers [OpenWakeWordModule]; it contributes no view managers. */
+class OpenWakeWordPackage : ReactPackage {
+    /** Creates the single native module this package provides. */
+    override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> =
+        listOf(OpenWakeWordModule(reactContext))
+
+    /** This package has no native views. */
+    override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> =
+        emptyList()
+}