From efb56e19f9cf597dbe5101c99577034be15349e5 Mon Sep 17 00:00:00 2001
From: msqr1 <repulseshipp@gmail.com>
Date: Thu, 25 Jan 2024 23:30:45 -0800
Subject: [PATCH] Recognizer design change

---
 README.md                     | 27 +++++++++++++----------
 compile.sh                    | 41 ++++++++++++++++++++---------------
 src/bindings.cc               |  4 ++--
 src/{preBefore.js => pre1.js} | 31 ++++++++++++++++----------
 src/{preMiddle.js => pre2.js} | 10 ++++++---
 src/pre3.js                   |  0
 src/preAfter.js               |  3 ---
 7 files changed, 67 insertions(+), 49 deletions(-)
 rename src/{preBefore.js => pre1.js} (65%)
 rename src/{preMiddle.js => pre2.js} (61%)
 create mode 100644 src/pre3.js
 delete mode 100644 src/preAfter.js
diff --git a/README.md b/README.md
index 785ab76..2f170a8 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,29 @@
 # Browser-recognizer
 - A speech recognizer built on Vosk that can be run on the browser, inspired by [vosk-browser](https://github.com/ccoreilly/vosk-browser), but built from scratch and no code taken!
 - Browser-recognizer can run both in the browser main thread and web workers
-- The API also designed to have strong exception safety
+- The API is also designed with strong exception safety
 ## Global and all objects' common interface 
 | Function signature (global) | Description |
 |---|---|
-| ```Promise makeModel(path: string, url: string, id: string)```<br><br>```Promise makeSpkModel(path: string, url: string, id: string)``` | - If **path** contains valid model files and **id** is the same, there will not be a fetch from **url**.<br>- If **path** doesn't contain valid model files, or if it contains valid model files but **id** is different, there will be a fetch from **url**, and the model is stored with **id**. |
+| ```Promise<Model> makeModel(path: string, url: string, id: string)```<br><br>```Promise<SpkModel> makeSpkModel(path: string, url: string, id: string)``` | Make a ```Model``` or ```SpkModel```<br>- If **path** contains valid model files and **id** is the same, there will not be a fetch from **url**.<br>- If **path** doesn't contain valid model files, or if it contains valid model files but **id** is different, there will be a fetch from **url**, and the model is stored with **id**. |
+| ```Promise<Recognizer> makeRecognizer(model: Model)``` | Make a ```Recognizer```; it will use a separate thread for recognition |
 | ```setLogLevel(lvl: int)``` | Set Vosk's log level (default: -1) <br>- 2: Error<br>- 1: Warning<br>- 0: Info <br>- 1: Verbose<br>- 2: More verbose<br>- 3: Debug |
-| ```deleteAll()``` | Call ```delete()``` on all objects, it is recommended to put this at the end of the program to automatically clean up. See [why](https://emscripten.org/docs/getting_started/FAQ.html#what-does-exiting-the-runtime-mean-why-don-t-atexit-s-run).|
+| ```deleteAll()``` | Call ```delete()``` on all objects. It is recommended to run this once you are done using the API to automatically clean up everything. See [why](https://emscripten.org/docs/getting_started/FAQ.html#what-does-exiting-the-runtime-mean-why-don-t-atexit-s-run). |
 
 | Function signature (all objects) | Description
 |---|---|
-| ```delete()``` | Delete (call C++ destructor on) this object
+| ```delete()``` | Delete this object
 ## ```Recognizer``` object
 | Function signature | Description |
 |---|---|
-| ```processAudio(ctx: AudioContext)``` | Recognize an audio chunk 
+| ```Promise<AudioWorkletNode> getNode(ctx: AudioContext)``` | Get a pass-through node that recognizes audio and is connectable to a processing graph |
+| ```recognize(buf: AudioBuffer)``` | Recognize an AudioBuffer, usually from something like ```BaseAudioContext.decodeAudioData()```
 | ```setPartialWords(partialWords: bool)``` | Return words' information in a partialResult event (default: false) |
 | ```setWords(words: bool)``` | Return words' information in a result event (default: false) |
 | ```setNLSML(nlsml: bool)``` | Return result and partialResult in NLSML form (default: false) |
 | ```setMaxAlternatives(alts: int)``` | Set the max number of alternatives for result event (default: false) |
 | ```setGrm(grm: string)``` | Add grammar to the recognizer (default: none) |
-| ```setSpkModel(mdl: spkmodel)``` | Set the speaker model of the recognizer (default: none) |
+| ```setSpkModel(mdl: SpkModel)``` | Set the speaker model of the recognizer (default: none) |
 
 | Event | Description |
 |---|---|
@@ -32,13 +34,14 @@ Changing any setting to non-default values requires recompilation
 ```
 git clone --depth=1 https://github.com/msqr1/Browser-recognizer &&
 cd Browser-recognizer &&
-[Name]=[Value]... ./compile.sh
+[Options] ./compile.sh
 ```
-| Name | Description | Default value |
+| Option | Description | Default value |
 |---|---|---|
-| MAX_MEMORY | Set max memory (suffix mb, gb, or none for bytes) | 300mb, as [recommended](https://alphacephei.com/vosk/models) |
-| MAX_THREAD | Set the max number of thread (min: 2) | 2 (1 OPFS thread + 1 recognizer thread) |
-| EMSDK | Set EMSDK's path (will install EMSDK in root folder if unset) | **.** |
+| MAX_MEMORY | Set max memory, valid suffixes: kb, mb, gb, tb or none (bytes) | ```300mb```, as [recommended](https://alphacephei.com/vosk/models) |
+| MAX_THREADS | Set the max number of threads (minimum: 2) | ```2``` (1 OPFS thread + 1 recognizer thread) |
+| COMPILE_JOBS | Set the number of jobs (threads) when compiling | ```$(nproc)```   |
+| EMSDK | Set EMSDK's path (will install EMSDK in root folder if unset) | ```.``` |
 ## Response headers
 Browser-recognizer require SharedArrayBuffer, so these response headers must be set:
 - ***Cross-Origin-Embedder-Policy*** ---> ***require-corp***
@@ -46,7 +49,7 @@ Browser-recognizer require SharedArrayBuffer, so these response headers must be
 
 If you can't set them, you may use a VERY HACKY workaround at *src/addCOI.js*.
 ## Additions to vosk-browser:
-- Can download multiple models
+- Download multiple models
 - Model storage path management (when many models are required)
 - Model ID management (when model updates are required)
 
diff --git a/compile.sh b/compile.sh
index ace4e73..a4c2664 100755
--- a/compile.sh
+++ b/compile.sh
@@ -4,14 +4,14 @@
   #     #  #  # #      # ##  #      # #   #     #      #  #  #     #     # #      #
   #     ####  #  #     ###   ####    #    ####  #####  ####  #     ####  #  #  ####
 
-
 # Total build time is around 45 minutes, mostly from building Kaldi
-
 sudo apt install shtool libtool autogen autotools-dev pkg-config make &&
 
-MAX_MEMORY?=300mb &&
-MAX_THREAD?=2 &&
-EMSDK?=$(realpath .) &&
+MAX_MEMORY=${MAX_MEMORY:-300mb} &&
+MAX_THREADS=${MAX_THREADS:-2} &&
+EMSDK=${EMSDK:-$(realpath .)} &&
+COMPILE_JOBS=${COMPILE_JOBS:-$(nproc)} &&
+
 SRC=$(realpath src) &&
 KALDI=$(realpath kaldi) &&
 VOSK=$(realpath vosk-api) &&
@@ -20,22 +20,27 @@ LIBARCHIVE=$(realpath libarchive) &&
 ZSTD=$(realpath zstd) && 
 CLAPACK_WASM=$(realpath clapack-wasm) &&
 
-if [[! -d $EMSDK_PATH] ]
+if [ ! -d $EMSDK_PATH ]; then
   echo "Invalid EMSDK path"
   exit 1
-if [[! $MAX_MEMORY =~ [0-9]+(mb|gb)* ]]
-  echo "Invalid MAX_MEMORY value"
+fi
+if [ $MAX_THREAD -lt 2 ]; then
+  echo "MAX_THREAD be greater or equal to 2" &&
   exit 1
-if [ $MAX_THREAD < 2 ] 
-  echo "MAX_THREAD is less than 2"
+fi
+if ! [[ $MAX_MEMORY =~ ^[0-9]+([kmgt]b)?$ ]]; then
+  echo "MAX_MEMORY valid suffixes are kb, mb, gb, tb, none (bytes)" &&
   exit 1
-if [$(realpath $EMSDK) = $(realpath .)]; then
+fi 
+if [ $(realpath $EMSDK) == $(realpath emsdk) ]; then
+  echo "EMSDK is current directory, installing emsdk and Emscripten..." &&
   git clone --depth=1 https://github.com/emscripten-core/emsdk.git &&
   cd emsdk &&
   ./emsdk install 3.1.51 &&
-  ./emsdk activate 3.1.51 &&
+  ./emsdk activate 3.1.51 
+fi
 
-source $EMSDK/emsdk_env.sh &&
+. $EMSDK/emsdk_env.sh &&
 export PATH=:$PATH:$EMSDK/upstream/bin &&
 
 rm -rf /tmp/zstd &&
@@ -51,13 +56,13 @@ git clone -b vosk --depth=1 https://github.com/alphacep/kaldi &&
 git clone -b go/v0.3.46 --depth=1 https://github.com/alphacep/vosk-api &&
 
 cd /tmp/zstd && 
-HAVE_THREAD=0 ZSTD_LEGACY_SUPPORT=0 HAVE_ZLIB=0 HAVE_LZMA=0 HAVE_LZ4=0 ZSTD_NOBENCH=1 ZSTD_NODICT=1 ZSTD_NOCOMPRESS=1 BACKTRACE=0 PREFIX=$SRC/zstd CPPFLAGS="-O3 -flto" LDFLAGS="-O3 -flto" emmake make install &&
+HAVE_THREAD=0 ZSTD_LEGACY_SUPPORT=0 HAVE_ZLIB=0 HAVE_LZMA=0 HAVE_LZ4=0 ZSTD_NOBENCH=1 ZSTD_NODICT=1 ZSTD_NOCOMPRESS=1 BACKTRACE=0 PREFIX=$SRC/zstd CPPFLAGS="-O3 -flto" LDFLAGS="-O3 -flto" emmake make -j$COMPILE_JOBS install &&
 rm -rf /tmp/zstd &&
 
 cd /tmp/libarchive && 
 build/autogen.sh && 
 CPPFLAGS="-I$ZSTD/include -flto" LDFLAGS="-L$ZSTD/lib -flto" emconfigure ./configure --prefix=$SRC/libarchive --without-lz4 --without-lzma --without-zlib --without-bz2lib --without-xml2 --without-expat --without-cng --without-openssl --without-libb2 --disable-bsdunzip --disable-xattr --disable-acl --disable-bsdcpio --disable-bsdcat --disable-rpath --disable-maintainer-mode --disable-dependency-tracking --enable-static --disable-shared && 
-emmake make install &&
+emmake make -j$COMPILE_JOBS install &&
 rm -rf /tmp/libarchive &&
 
 cd $CLAPACK_WASM &&
@@ -66,13 +71,13 @@ bash ./install_repo.sh emcc &&
 cd /tmp/openfst &&
 autoreconf -i &&
 CXXFLAGS="-pthread -r -O3 -flto"  LDFLAGS="-O3 -pthread -flto" emconfigure ./configure --prefix=$OPENFST --enable-static --disable-shared --enable-ngram-fsts --enable-lookahead-fsts --disable-bin --with-pic && 
-emmake make install &&
+emmake make -j$COMPILE_JOBS install &&
 echo "PACKAGE_VERSION = 1.8.0" >> $OPENFST/Makefile &&
 
 cd $KALDI/src &&
 git apply $SRC/kaldi.patch &&
 CXXFLAGS="-O3 -msimd128 -UHAVE_EXECINFO_H -pthread -flto" LDFLAGS="-O3 -sERROR_ON_UNDEFINED_SYMBOLS=0 -lembind -pthread -flto" emconfigure ./configure --use-cuda=no --with-cudadecoder=no --static --static-math=yes --static-fst=yes  --debug-level=0 --double-precision=yes --clapack-root=$CLAPACK_WASM --host=WASM && 
-emmake make online2 lm rnnlm &&
+emmake make -j$COMPILE_JOBS online2 lm rnnlm &&
 
 cd $VOSK/src &&
 git apply $SRC/vosk.patch &&
@@ -81,4 +86,4 @@ em++ -pthread -O3 -flto -Wno-deprecated -I. -I$KALDI/src -I$OPENFST/include $VOS
 emar -rcs vosk.a ${VOSK_FILES//.cc/.o} &&
 
 cd $SRC &&
-em++ -O3 genericModel.cc model.cc spkModel.cc recognizer.cc bindings.cc -sWASMFS -sWASM_BIGINT -sSINGLE_FILE -sEMBIND_STD_STRING_IS_UTF8 -sSUPPORT_LONGJMP=0 -sMODULARIZE -sEXPORT_NAME=loadBR -sENVIRONMENT=web,worker -sINITIAL_MAX_MEMORY=$MAX_MEMORY -sASYNCIFY -sPTHREAD_POOL_SIZE=$MAX_THREAD -sPTHREAD_POOL_SIZE_STRICT -sPTHREAD_POOL_DELAY_LOAD -sASYNCIFY_ONLY=['emscripten_wget'] -sALLOW_BLOCKING_ON_MAIN_THREAD=0 -sPOLYFILL=0 --pre-js preBefore.js --pre-js preMiddle.js --pre-js preAfter.js -I. -I$LIBARCHIVE/include -I$VOSK/src -L$LIBARCHIVE/lib -larchive -L$ZSTD/lib -lzstd -L$KALDI/src -l:online2/kaldi-online2.a -l:decoder/kaldi-decoder.a -l:ivector/kaldi-ivector.a -l:gmm/kaldi-gmm.a -l:tree/kaldi-tree.a -l:feat/kaldi-feat.a -l:cudamatrix/kaldi-cudamatrix.a -l:lat/kaldi-lat.a -l:lm/kaldi-lm.a -l:rnnlm/kaldi-rnnlm.a -l:hmm/kaldi-hmm.a -l:nnet3/kaldi-nnet3.a -l:transform/kaldi-transform.a -l:matrix/kaldi-matrix.a -l:fstext/kaldi-fstext.a -l:util/kaldi-util.a -l:base/kaldi-base.a -L$OPENFST/lib -l:libfst.a -l:libfstngram.a -L$CLAPACK_WASM -l:CBLAS/lib/cblas.a -l:CLAPACK-3.2.1/lapack.a -l:CLAPACK-3.2.1/libcblaswr.a -l:f2c_BLAS-3.8.0/blas.a -l:libf2c/libf2c.a -L$VOSK/src -l:vosk.a -lopfs.js -lembind -pthread -flto -o BrowserRecognizer.js  
+em++ -O3 genericModel.cc model.cc spkModel.cc recognizer.cc bindings.cc -sWASMFS -sWASM_BIGINT -sSINGLE_FILE -sEMBIND_STD_STRING_IS_UTF8 -sSUPPORT_LONGJMP=0 -sMODULARIZE -sEXPORT_NAME=loadBR -sENVIRONMENT=web,worker -sINITIAL_MEMORY=32pf -sASYNCIFY -sPTHREAD_POOL_SIZE=$MAX_THREAD -sPTHREAD_POOL_SIZE_STRICT -sPTHREAD_POOL_DELAY_LOAD -sASYNCIFY_ONLY=['emscripten_wget'] -sALLOW_BLOCKING_ON_MAIN_THREAD=0 -sPOLYFILL=0 --pre-js pre1.js --pre-js pre2.js --pre-js pre3.js -I. -I$LIBARCHIVE/include -I$VOSK/src -L$LIBARCHIVE/lib -larchive -L$ZSTD/lib -lzstd -L$KALDI/src -l:online2/kaldi-online2.a -l:decoder/kaldi-decoder.a -l:ivector/kaldi-ivector.a -l:gmm/kaldi-gmm.a -l:tree/kaldi-tree.a -l:feat/kaldi-feat.a -l:cudamatrix/kaldi-cudamatrix.a -l:lat/kaldi-lat.a -l:lm/kaldi-lm.a -l:rnnlm/kaldi-rnnlm.a -l:hmm/kaldi-hmm.a -l:nnet3/kaldi-nnet3.a -l:transform/kaldi-transform.a -l:matrix/kaldi-matrix.a -l:fstext/kaldi-fstext.a -l:util/kaldi-util.a -l:base/kaldi-base.a -L$OPENFST/lib -l:libfst.a -l:libfstngram.a -L$CLAPACK_WASM -l:CBLAS/lib/cblas.a -l:CLAPACK-3.2.1/lapack.a -l:CLAPACK-3.2.1/libcblaswr.a -l:f2c_BLAS-3.8.0/blas.a -l:libf2c/libf2c.a -L$VOSK/src -l:vosk.a -lopfs.js -lembind -pthread -flto -o BrowserRecognizer.js
diff --git a/src/bindings.cc b/src/bindings.cc
index fcf37b1..65d61a6 100644
--- a/src/bindings.cc
+++ b/src/bindings.cc
@@ -21,10 +21,10 @@ int main() {
 }
 EMSCRIPTEN_BINDINGS() {
   function("setLogLevel", &vosk_set_log_level, allow_raw_pointers());
-  class_<model>("model")
+  class_<model>("Model")
   .constructor<std::string, std::string, std::string>(allow_raw_pointers());
 
-  class_<spkModel>("spkModel")
+  class_<spkModel>("SpkModel")
   .constructor<std::string, std::string, std::string>(allow_raw_pointers());
   
   class_<recognizer>("recognizer") 
diff --git a/src/preBefore.js b/src/pre1.js
similarity index 65%
rename from src/preBefore.js
rename to src/pre1.js
index abdaca8..d66d632 100644
--- a/src/preBefore.js
+++ b/src/pre1.js
@@ -1,20 +1,28 @@
 let objs =  []
-class recognizer extends EventTarget {
-  constructor(rec,ctx) {
+class Recognizer extends EventTarget {
+  constructor(rec) {
     super()
     this.obj = rec
-    this.ptr = Module._malloc(512)
+    objs.push(this)
+  }
+  getNode(ctx) {
     let channel = new MessageChannel()
-    this.copier = new AudioWorkletNode(ctx, 'BRCopier', { channelCount: 1, numberOfInputs: 1, numberOfOutputs: 0 })
-    this.copier.port.postMessage({cmd : "init", ptr: this.ptr},[channel.port1])
+    this.node = new AudioWorkletNode(ctx, 'BRProcessor', { channelCount: 1, numberOfInputs: 1, numberOfOutputs: 1 })
+    node.port.postMessage({cmd : "init", ptr: this.ptr},[channel.port1])
     channel.port1.onmessage = (ev) => {
       this.obj.acceptWaveForm(this.ptr, 512)
     } 
-    objs.push(this)
+    return this.node
+  }
+  recognize(buf) {
+    buf.copyFromChannel()
+    this.obj.acceptWaveForm(this.ptr, 512)
   }
   delete() {
     this.obj.delete()
-    this.copier.port.postMessage({cmd : "deinit"})
+    if(typeof this.node !== "undefined") {
+      this.node.port.postMessage({cmd : "deinit"})
+    }
     Module.free(this.ptr)
   }
   setWords(words) {
@@ -43,7 +51,7 @@ Module.deleteAll = () => {
 Module.makeModel = async (url, path, id) => {
   let mdl
   try {
-    mdl = new Module.model(url, path, id)
+    mdl = new Module.Model(url, path, id)
   }
   catch(e) {
     mdl.delete()
@@ -55,7 +63,7 @@ Module.makeModel = async (url, path, id) => {
 Module.makeSpkModel = async (url, path, id) => {
   let mdl
   try {
-    mdl = new Module.spkModel(url, path, id)
+    mdl = new Module.SpkModel(url, path, id)
   }
   catch(e) {
     mdl.delete()
@@ -64,7 +72,7 @@ Module.makeSpkModel = async (url, path, id) => {
   objs.push(mdl)
   return mdl
 }, ctx.AudioWorklet
-Module.makeRecognizer = async (model, ctx) => {
+Module.makeRecognizer = async (model) => {
   let rec
   try {
     rec = new Module.recognizer(model, ctx.sampleRate, objs.length)
@@ -73,4 +81,5 @@ Module.makeRecognizer = async (model, ctx) => {
     rec.delete()
     return Promise.reject(e)
   }
-  await ctx.AudioWorklet.addModule(URL.createObjectURL(new Blob([`
+  return new Recognizer(rec)
+}
diff --git a/src/preMiddle.js b/src/pre2.js
similarity index 61%
rename from src/preMiddle.js
rename to src/pre2.js
index 554c308..28e7bbe 100644
--- a/src/preMiddle.js
+++ b/src/pre2.js
@@ -1,4 +1,5 @@
-registerProcessor("BRCopier", class extends AudioWorkletProcessor {
+// A copy and pass processor
+registerProcessor("BRProcessor", class extends AudioWorkletProcessor {
   constructor(options) {
     super(options)
     this.ret = true
@@ -7,6 +8,8 @@ registerProcessor("BRCopier", class extends AudioWorkletProcessor {
         case "init":
           this.recognizerPort = ev.ports[0]
           this.wasmMem = new Float32Array(WebAssembly.Memory.buffer).subarray(ev.ptr, ev.ptr+512)
+          this.channel = ev.channel
+          this.input = ev.input
           break
         case "deinit":
           this.ret = false
@@ -16,8 +19,9 @@ registerProcessor("BRCopier", class extends AudioWorkletProcessor {
   }
   process(inputs, outputs, params) {
     if(!this.ret) return false;
-    inputs[0].copyFromChannel(this.wasmMem, 0)
-    this.recognizerPort.postMessage("done")
+    inputs[this.input].copyFromChannel(this.wasmMem, this.channel)
+    outputs = inputs
+    this.recognizerPort.postMessage(".") // A
     return true
   }
 })
\ No newline at end of file
diff --git a/src/pre3.js b/src/pre3.js
new file mode 100644
index 0000000..e69de29
diff --git a/src/preAfter.js b/src/preAfter.js
deleted file mode 100644
index ac57dfa..0000000
--- a/src/preAfter.js
+++ /dev/null
@@ -1,3 +0,0 @@
-  `])))
-  return new recognizer(rec,ctx)
-}
\ No newline at end of file