feat(app): Wake-Word komplett on-device via openWakeWord (ONNX)
Picovoice/Porcupine raus — neuer Stack ist openWakeWord (Apache 2.0, on-device, ONNX Runtime). Kein API-Key, keine Lizenzgebuehren, Audio verlaesst das Geraet nicht. Eigene Wake-Words sind via openWakeWord- Notebook gratis trainierbar. Pipeline (alles im OpenWakeWordModule.kt): 1. AudioRecord 16kHz mono int16 in 1280-Sample-Chunks (80ms) 2. melspectrogram.onnx → 32-mel Frames (mel/10 + 2 wie in Python) 3. embedding_model.onnx, 76-Frame Sliding Window (stride 8) → 96-dim 4. hey_jarvis.onnx (oder anderes Keyword) auf letzten 16 Embeddings 5. Sigmoid-Score, threshold/patience/debounce-Filter 6. RN-Event "WakeWordDetected" raus Mitgelieferte Modelle in assets/openwakeword/: hey_jarvis (Default), alexa, hey_mycroft, hey_rhasspy. Externe Service-API (start/stop/ configure/onWakeWord/...) bleibt identisch — ChatScreen unveraendert. build.gradle: com.microsoft.onnxruntime:onnxruntime-android:1.17.1 package.json: @picovoice/porcupine-react-native + voice-processor raus SettingsScreen: AccessKey-Feld weg, neue Keyword-Liste mit Labels README: Wake-Word-Sektion komplett umgeschrieben (kein Picovoice mehr) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ class MainApplication : Application(), ReactApplication {
|
||||
add(ApkInstallerPackage())
|
||||
add(AudioFocusPackage())
|
||||
add(PcmStreamPlayerPackage())
|
||||
add(OpenWakeWordPackage())
|
||||
}
|
||||
|
||||
override fun getJSMainModuleName(): String = "index"
|
||||
|
||||
@@ -0,0 +1,357 @@
|
||||
package com.ariacockpit
|
||||
|
||||
import ai.onnxruntime.OnnxTensor
|
||||
import ai.onnxruntime.OrtEnvironment
|
||||
import ai.onnxruntime.OrtSession
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import android.util.Log
|
||||
import androidx.core.content.ContextCompat
|
||||
import com.facebook.react.bridge.Promise
|
||||
import com.facebook.react.bridge.ReactApplicationContext
|
||||
import com.facebook.react.bridge.ReactContextBaseJavaModule
|
||||
import com.facebook.react.bridge.ReactMethod
|
||||
import com.facebook.react.modules.core.DeviceEventManagerModule
|
||||
import java.nio.FloatBuffer
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
|
||||
/**
|
||||
* Wake-Word Erkennung on-device via openWakeWord (https://github.com/dscripka/openWakeWord).
|
||||
*
|
||||
* Drei-stufige ONNX Pipeline:
|
||||
* 1. Audio (16kHz mono int16, 1280-Sample-Chunks) → Melspectrogram → 32-mel Frames
|
||||
* 2. 76 Mel-Frames Sliding Window (stride 8) → Speech-Embedding → 96-dim Vektor
|
||||
* 3. Letzte 16 Embeddings (~1.28s Kontext) → Wake-Word-Klassifikator → Sigmoid-Score
|
||||
*
|
||||
* Modelle liegen in assets/openwakeword/ (mel + embedding shared, plus pro Keyword
|
||||
* ein eigenes .onnx). Erkennung feuert nach `patience` aufeinanderfolgenden
|
||||
* Frames ueber `threshold` und unterdrueckt Wiederholungen fuer `debounceMs`.
|
||||
*
|
||||
* Emittiert "WakeWordDetected" als RN-Event wenn ein Trigger erkannt wurde.
|
||||
*/
|
||||
class OpenWakeWordModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
|
||||
override fun getName() = "OpenWakeWord"
|
||||
|
||||
companion object {
|
||||
private const val TAG = "OpenWakeWord"
|
||||
private const val SAMPLE_RATE = 16000
|
||||
private const val CHUNK_SAMPLES = 1280 // 80ms @ 16kHz
|
||||
private const val MEL_FRAMES_PER_EMBEDDING = 76 // Embedding-Fenster
|
||||
private const val EMBEDDING_STRIDE = 8 // Slide um 8 Mel-Frames
|
||||
private const val EMBEDDING_DIM = 96
|
||||
private const val WW_INPUT_FRAMES = 16 // 16 Embeddings = ~1.28s
|
||||
private const val MEL_BINS = 32
|
||||
}
|
||||
|
||||
private val env: OrtEnvironment = OrtEnvironment.getEnvironment()
|
||||
private var melSession: OrtSession? = null
|
||||
private var embSession: OrtSession? = null
|
||||
private var wwSession: OrtSession? = null
|
||||
|
||||
private var melInputName: String = "input"
|
||||
private var embInputName: String = "input_1"
|
||||
private var wwInputName: String = "input"
|
||||
|
||||
// Konfiguration
|
||||
private var threshold: Float = 0.5f
|
||||
private var patience: Int = 2
|
||||
private var debounceMs: Long = 1500
|
||||
private var modelName: String = "hey_jarvis"
|
||||
|
||||
// Audio-Capture-Thread
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private val running = AtomicBoolean(false)
|
||||
private var captureThread: Thread? = null
|
||||
|
||||
// Inferenz-State
|
||||
private val melBuffer: ArrayList<FloatArray> = ArrayList(256) // Liste von 32-dim Frames
|
||||
private var melProcessedIdx: Int = 0
|
||||
private val embBuffer: ArrayDeque<FloatArray> = ArrayDeque(32) // Ringpuffer letzter Embeddings
|
||||
private var consecutiveAboveThreshold: Int = 0
|
||||
private var lastDetectionMs: Long = 0L
|
||||
|
||||
/**
|
||||
* Initialisiert die ONNX-Sessions fuer ein bestimmtes Wake-Word.
|
||||
* modelName: dateiname ohne Suffix (z.B. "hey_jarvis", "alexa", "hey_mycroft", "hey_rhasspy")
|
||||
*/
|
||||
@ReactMethod
|
||||
fun init(modelName: String, threshold: Double, patience: Int, debounceMs: Int, promise: Promise) {
|
||||
try {
|
||||
disposeSessions()
|
||||
this.modelName = modelName
|
||||
this.threshold = threshold.toFloat()
|
||||
this.patience = patience.coerceAtLeast(1)
|
||||
this.debounceMs = debounceMs.toLong()
|
||||
|
||||
val ctx = reactApplicationContext
|
||||
val melBytes = ctx.assets.open("openwakeword/melspectrogram.onnx").use { it.readBytes() }
|
||||
val embBytes = ctx.assets.open("openwakeword/embedding_model.onnx").use { it.readBytes() }
|
||||
val wwBytes = ctx.assets.open("openwakeword/$modelName.onnx").use { it.readBytes() }
|
||||
|
||||
val opts = OrtSession.SessionOptions()
|
||||
melSession = env.createSession(melBytes, opts)
|
||||
embSession = env.createSession(embBytes, opts)
|
||||
wwSession = env.createSession(wwBytes, opts)
|
||||
|
||||
melInputName = melSession!!.inputNames.first()
|
||||
embInputName = embSession!!.inputNames.first()
|
||||
wwInputName = wwSession!!.inputNames.first()
|
||||
|
||||
Log.i(TAG, "Init OK: model=$modelName threshold=$threshold patience=$patience " +
|
||||
"debounce=${debounceMs}ms (inputs: mel=$melInputName emb=$embInputName ww=$wwInputName)")
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Init fehlgeschlagen: ${e.message}", e)
|
||||
disposeSessions()
|
||||
promise.reject("INIT_FAILED", e.message ?: "Unbekannter Fehler", e)
|
||||
}
|
||||
}
|
||||
|
||||
@ReactMethod
|
||||
fun start(promise: Promise) {
|
||||
if (running.get()) {
|
||||
promise.resolve(true)
|
||||
return
|
||||
}
|
||||
if (melSession == null || embSession == null || wwSession == null) {
|
||||
promise.reject("NOT_INITIALIZED", "init() muss vor start() aufgerufen werden")
|
||||
return
|
||||
}
|
||||
// Berechtigung pruefen — der App-Code holt die ueblicherweise schon vorher,
|
||||
// aber wir bestehen hier explizit darauf damit AudioRecord nicht stumm
|
||||
// failt.
|
||||
val perm = ContextCompat.checkSelfPermission(reactApplicationContext, Manifest.permission.RECORD_AUDIO)
|
||||
if (perm != PackageManager.PERMISSION_GRANTED) {
|
||||
promise.reject("NO_MIC_PERMISSION", "RECORD_AUDIO Permission fehlt")
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
val minBuf = AudioRecord.getMinBufferSize(
|
||||
SAMPLE_RATE,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT,
|
||||
).coerceAtLeast(CHUNK_SAMPLES * 2 * 4)
|
||||
|
||||
val record = AudioRecord(
|
||||
MediaRecorder.AudioSource.MIC,
|
||||
SAMPLE_RATE,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT,
|
||||
minBuf,
|
||||
)
|
||||
if (record.state != AudioRecord.STATE_INITIALIZED) {
|
||||
record.release()
|
||||
promise.reject("AUDIO_INIT", "AudioRecord nicht initialisiert (Mikro belegt?)")
|
||||
return
|
||||
}
|
||||
audioRecord = record
|
||||
resetInferenceState()
|
||||
running.set(true)
|
||||
record.startRecording()
|
||||
|
||||
captureThread = Thread({ captureLoop() }, "OpenWakeWordCapture").apply {
|
||||
isDaemon = true
|
||||
start()
|
||||
}
|
||||
|
||||
Log.i(TAG, "Lauschen gestartet (model=$modelName)")
|
||||
promise.resolve(true)
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "start fehlgeschlagen", e)
|
||||
running.set(false)
|
||||
audioRecord?.release()
|
||||
audioRecord = null
|
||||
promise.reject("START_FAILED", e.message ?: "Unbekannter Fehler", e)
|
||||
}
|
||||
}
|
||||
|
||||
@ReactMethod
|
||||
fun stop(promise: Promise) {
|
||||
running.set(false)
|
||||
try {
|
||||
captureThread?.join(1500)
|
||||
} catch (_: InterruptedException) {}
|
||||
captureThread = null
|
||||
try { audioRecord?.stop() } catch (_: Exception) {}
|
||||
try { audioRecord?.release() } catch (_: Exception) {}
|
||||
audioRecord = null
|
||||
Log.i(TAG, "Lauschen gestoppt")
|
||||
promise.resolve(true)
|
||||
}
|
||||
|
||||
@ReactMethod
|
||||
fun dispose(promise: Promise) {
|
||||
running.set(false)
|
||||
try { captureThread?.join(1000) } catch (_: InterruptedException) {}
|
||||
captureThread = null
|
||||
try { audioRecord?.stop() } catch (_: Exception) {}
|
||||
try { audioRecord?.release() } catch (_: Exception) {}
|
||||
audioRecord = null
|
||||
disposeSessions()
|
||||
promise.resolve(true)
|
||||
}
|
||||
|
||||
@ReactMethod
|
||||
fun isAvailable(promise: Promise) {
|
||||
// Wake-Word ist immer verfuegbar (kein API-Key, alles on-device)
|
||||
promise.resolve(true)
|
||||
}
|
||||
|
||||
// RN-Event-Subscriptions — RN-Konvention, sonst Warnung im Debug-Build
|
||||
@ReactMethod fun addListener(eventName: String) {}
|
||||
@ReactMethod fun removeListeners(count: Int) {}
|
||||
|
||||
private fun disposeSessions() {
|
||||
try { melSession?.close() } catch (_: Exception) {}
|
||||
try { embSession?.close() } catch (_: Exception) {}
|
||||
try { wwSession?.close() } catch (_: Exception) {}
|
||||
melSession = null
|
||||
embSession = null
|
||||
wwSession = null
|
||||
}
|
||||
|
||||
private fun resetInferenceState() {
|
||||
melBuffer.clear()
|
||||
melProcessedIdx = 0
|
||||
embBuffer.clear()
|
||||
consecutiveAboveThreshold = 0
|
||||
lastDetectionMs = 0L
|
||||
}
|
||||
|
||||
private fun emitDetected() {
|
||||
val params = com.facebook.react.bridge.Arguments.createMap().apply {
|
||||
putString("model", modelName)
|
||||
}
|
||||
try {
|
||||
reactApplicationContext
|
||||
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
|
||||
.emit("WakeWordDetected", params)
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "emit fehlgeschlagen: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
private fun captureLoop() {
|
||||
val buf = ShortArray(CHUNK_SAMPLES)
|
||||
val record = audioRecord ?: return
|
||||
Log.i(TAG, "Capture-Loop gestartet")
|
||||
while (running.get()) {
|
||||
var read = 0
|
||||
while (read < CHUNK_SAMPLES && running.get()) {
|
||||
val n = record.read(buf, read, CHUNK_SAMPLES - read)
|
||||
if (n <= 0) {
|
||||
Log.w(TAG, "AudioRecord.read returned $n — Loop ende")
|
||||
running.set(false)
|
||||
return
|
||||
}
|
||||
read += n
|
||||
}
|
||||
if (!running.get()) break
|
||||
try {
|
||||
processChunk(buf)
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "processChunk: ${e.message}")
|
||||
}
|
||||
}
|
||||
Log.i(TAG, "Capture-Loop beendet")
|
||||
}
|
||||
|
||||
/** Verarbeitet einen 1280-Sample int16 Audio-Chunk. */
|
||||
private fun processChunk(audio: ShortArray) {
|
||||
// 1) Audio → mel (output (1, 1, frames, 32))
|
||||
val floats = FloatArray(audio.size) { audio[it].toFloat() }
|
||||
val melTensor = OnnxTensor.createTensor(
|
||||
env,
|
||||
FloatBuffer.wrap(floats),
|
||||
longArrayOf(1L, audio.size.toLong()),
|
||||
)
|
||||
val melResult = melSession!!.run(mapOf(melInputName to melTensor))
|
||||
val melOut = melResult.get(0).value
|
||||
melTensor.close()
|
||||
@Suppress("UNCHECKED_CAST")
|
||||
val mel4 = melOut as Array<Array<Array<FloatArray>>>
|
||||
val frames = mel4[0][0]
|
||||
// openWakeWord wendet `mel/10 + 2` an, bevor es ans Embedding-Modell geht
|
||||
for (frame in frames) {
|
||||
val scaled = FloatArray(frame.size) { frame[it] / 10f + 2f }
|
||||
melBuffer.add(scaled)
|
||||
}
|
||||
melResult.close()
|
||||
|
||||
// 2) Sliding window: alle vollstaendigen 76-Frame-Fenster verarbeiten
|
||||
while (melBuffer.size >= melProcessedIdx + MEL_FRAMES_PER_EMBEDDING) {
|
||||
val flat = FloatArray(MEL_FRAMES_PER_EMBEDDING * MEL_BINS)
|
||||
var pos = 0
|
||||
for (i in 0 until MEL_FRAMES_PER_EMBEDDING) {
|
||||
val src = melBuffer[melProcessedIdx + i]
|
||||
System.arraycopy(src, 0, flat, pos, MEL_BINS)
|
||||
pos += MEL_BINS
|
||||
}
|
||||
val embIn = OnnxTensor.createTensor(
|
||||
env,
|
||||
FloatBuffer.wrap(flat),
|
||||
longArrayOf(1L, MEL_FRAMES_PER_EMBEDDING.toLong(), MEL_BINS.toLong(), 1L),
|
||||
)
|
||||
val embRes = embSession!!.run(mapOf(embInputName to embIn))
|
||||
val embOut = embRes.get(0).value
|
||||
embIn.close()
|
||||
// Erwartete Output-Form: (1, 96) → Array<FloatArray>
|
||||
@Suppress("UNCHECKED_CAST")
|
||||
val embArr = embOut as Array<FloatArray>
|
||||
embBuffer.addLast(embArr[0].copyOf())
|
||||
while (embBuffer.size > WW_INPUT_FRAMES) embBuffer.removeFirst()
|
||||
embRes.close()
|
||||
|
||||
melProcessedIdx += EMBEDDING_STRIDE
|
||||
}
|
||||
// Mel-Buffer trimmen — verhindert Memory-Wachstum
|
||||
if (melProcessedIdx > MEL_FRAMES_PER_EMBEDDING) {
|
||||
val keepFrom = melProcessedIdx - MEL_FRAMES_PER_EMBEDDING
|
||||
val newList = ArrayList<FloatArray>(melBuffer.size - keepFrom)
|
||||
for (i in keepFrom until melBuffer.size) newList.add(melBuffer[i])
|
||||
melBuffer.clear()
|
||||
melBuffer.addAll(newList)
|
||||
melProcessedIdx = MEL_FRAMES_PER_EMBEDDING
|
||||
}
|
||||
|
||||
// 3) Klassifikation — sobald wir 16 Embeddings haben
|
||||
if (embBuffer.size < WW_INPUT_FRAMES) return
|
||||
val flatEmb = FloatArray(WW_INPUT_FRAMES * EMBEDDING_DIM)
|
||||
var p = 0
|
||||
for (e in embBuffer) {
|
||||
System.arraycopy(e, 0, flatEmb, p, EMBEDDING_DIM)
|
||||
p += EMBEDDING_DIM
|
||||
}
|
||||
val wwIn = OnnxTensor.createTensor(
|
||||
env,
|
||||
FloatBuffer.wrap(flatEmb),
|
||||
longArrayOf(1L, WW_INPUT_FRAMES.toLong(), EMBEDDING_DIM.toLong()),
|
||||
)
|
||||
val wwRes = wwSession!!.run(mapOf(wwInputName to wwIn))
|
||||
val wwOut = wwRes.get(0).value
|
||||
wwIn.close()
|
||||
// Erwartete Output-Form: (1, 1) → Array<FloatArray>
|
||||
@Suppress("UNCHECKED_CAST")
|
||||
val score = (wwOut as Array<FloatArray>)[0][0]
|
||||
wwRes.close()
|
||||
|
||||
if (score >= threshold) {
|
||||
consecutiveAboveThreshold++
|
||||
if (consecutiveAboveThreshold >= patience) {
|
||||
val now = System.currentTimeMillis()
|
||||
if (now - lastDetectionMs >= debounceMs) {
|
||||
lastDetectionMs = now
|
||||
consecutiveAboveThreshold = 0
|
||||
Log.i(TAG, "Wake-Word erkannt! score=$score model=$modelName")
|
||||
emitDetected()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
consecutiveAboveThreshold = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package com.ariacockpit
|
||||
|
||||
import com.facebook.react.ReactPackage
|
||||
import com.facebook.react.bridge.NativeModule
|
||||
import com.facebook.react.bridge.ReactApplicationContext
|
||||
import com.facebook.react.uimanager.ViewManager
|
||||
|
||||
class OpenWakeWordPackage : ReactPackage {
|
||||
override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
|
||||
return listOf(OpenWakeWordModule(reactContext))
|
||||
}
|
||||
|
||||
override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
|
||||
return emptyList()
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user