From 29ec3348bf3fbe2a49d592a7fcdb672147ae0d4a Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Tue, 11 Jun 2024 15:42:33 +0100
Subject: [PATCH 1/7] Change KWS to use noise to signal ratio for result
 checking, and change reference from Signal-to-Noise to Noise-to-Signal in
 other unit tests.

---
 tests/test_abf_f32.c |   8 +-
 tests/test_aec_f32.c |   8 +-
 tests/test_anr_f32.c |   8 +-
 tests/test_kws.c     | 306 ++++++++++++++++++++++++++-----------------
 4 files changed, 203 insertions(+), 127 deletions(-)

diff --git a/tests/test_abf_f32.c b/tests/test_abf_f32.c
index 1183614..c9ee548 100644
--- a/tests/test_abf_f32.c
+++ b/tests/test_abf_f32.c
@@ -1,4 +1,5 @@
 /**
+ * Copyright (C) 2024 SPEC Embedded Group
  * Copyright (C) 2022 EEMBC
  *
  * All EEMBC Benchmark Software are products of EEMBC and are provided under the
@@ -19,7 +20,8 @@
 #define NSAMPLES      256U
 #define NFRAMEBYTES   512U
 
-#define SNRM50DB 0.003162f
+/* Noise to signal ratio */
+#define NSRM50DB 0.003162f
 
 extern const int16_t p_channel1[TEST_NBUFFERS][NSAMPLES];
 extern const int16_t p_channel2[TEST_NBUFFERS][NSAMPLES];
@@ -108,10 +110,10 @@ main(int argc, char *argv[])
         }
 
         ratio = (float)B / (float)A;
-        if (ratio > SNRM50DB)
+        if (ratio > NSRM50DB)
         {
             err = true;
-            printf("ABF FAIL: Frame #%d exceeded -50 dB SNR\n", i);
+            printf("ABF FAIL: Frame #%d exceeded -50 dB Noise-to-Signal ratio\n", i);
         }
     }
 
diff --git a/tests/test_aec_f32.c b/tests/test_aec_f32.c
index ac6706a..37896ef 100644
--- a/tests/test_aec_f32.c
+++ b/tests/test_aec_f32.c
@@ -1,4 +1,5 @@
 /**
+ * Copyright (C) 2024 SPEC Embedded Group
  * Copyright (C) 2022 EEMBC
  *
  * All EEMBC Benchmark Software are products of EEMBC and are provided under the
@@ -19,7 +20,8 @@
 #define NSAMPLES      256U
 #define NFRAMEBYTES   512U
 
-#define SNRM50DB 0.003162f
+/* Noise to signal ratio */
+#define NSRM50DB 0.003162f
 
 extern const int16_t p_input[TEST_NBUFFERS][NSAMPLES];
 extern const int16_t p_echo[TEST_NBUFFERS][NSAMPLES];
@@ -118,10 +120,10 @@ main(int argc, char *argv[])
         }
 
         ratio = (float)B / (float)A;
-        if (ratio > SNRM50DB)
+        if (ratio > NSRM50DB)
         {
             err = true;
-            printf("AEC FAIL: Frame #%d exceeded -50 dB SNR\n", i);
+            printf("AEC FAIL: Frame #%d exceeded -50 dB Noise-to-Signal ratio\n", i);
         }
     }
 
diff --git a/tests/test_anr_f32.c b/tests/test_anr_f32.c
index d1028db..f986180 100644
--- a/tests/test_anr_f32.c
+++ b/tests/test_anr_f32.c
@@ -1,4 +1,5 @@
 /**
+ * Copyright (C) 2024 SPEC Embedded Group
  * Copyright (C) 2022 EEMBC
  *
  * All EEMBC Benchmark Software are products of EEMBC and are provided under the
@@ -19,7 +20,8 @@
 #define NSAMPLES      256U
 #define NFRAMEBYTES   512U
 
-#define SNRM50DB 0.003162f
+/* Noise to signal ratio */
+#define NSRM50DB 0.003162f
 
 extern const int16_t p_input[TEST_NBUFFERS][NSAMPLES];
 extern const int16_t p_expected[TEST_NBUFFERS][NSAMPLES];
@@ -111,10 +113,10 @@ main(int argc, char *argv[])
         }
 
         ratio = (float)B / (float)A;
-        if (ratio > SNRM50DB)
+        if (ratio > NSRM50DB)
         {
             err = true;
-            printf("ANR FAIL: Frame #%d exceeded -50 dB SNR\n", i);
+            printf("ANR FAIL: Frame #%d exceeded -50 dB Noise-to-Signal ratio\n", i);
         }
     }
 
diff --git a/tests/test_kws.c b/tests/test_kws.c
index d0d72cf..b586f45 100644
--- a/tests/test_kws.c
+++ b/tests/test_kws.c
@@ -1,118 +1,188 @@
-/**
- * Copyright (C) 2022 EEMBC
- *
- * All EEMBC Benchmark Software are products of EEMBC and are provided under the
- * terms of the EEMBC Benchmark License Agreements. The EEMBC Benchmark Software
- * are proprietary intellectual properties of EEMBC and its Members and is
- * protected under all applicable laws, including all applicable copyright laws.
- *
- * If you received this EEMBC Benchmark Software without having a currently
- * effective EEMBC Benchmark License Agreement, you must discontinue use.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "ee_types.h"
-#include "ee_audiomark.h"
-
-#define NBUFFERS 93
-#define NINFERS  73
-#define NSAMPLES 256
-#define NCLASSES 12
-
-extern const int16_t p_input[NBUFFERS][NSAMPLES];
-extern const int8_t  p_expected[NINFERS][NCLASSES];
-
-// Used deep inside audiomark core
-char *spxGlobalHeapPtr;
-char *spxGlobalHeapEnd;
-
-int32_t ee_kws_f32(int32_t command,
-                   void  **pp_instance,
-                   void   *p_data,
-                   void   *p_params);
-
-static int16_t        aec_output[256];     // 5
-static int16_t        audio_fifo[13 * 64]; // 6
-static int8_t         mfcc_fifo[490];      // 7
-static int8_t         classes[12];         // 8
-static xdais_buffer_t xdais[4];
-
-int
-main(int argc, char *argv[])
-{
-    int           err           = 0;
-    int           new_inference = 0;
-    const int8_t *p_check       = NULL;
-    int           idx_check     = 0;
-    uint32_t      memreq        = 0;
-    uint32_t     *p_req         = &memreq;
-    void         *memory        = NULL;
-    void         *inst          = NULL;
-
-    int inferences = 0;
-
-    ee_kws_f32(NODE_MEMREQ, (void **)&p_req, NULL, NULL);
-
-    printf("KWS F32 MEMREQ = %d bytes\n", memreq);
-    memory = malloc(memreq);
-    if (!memory)
-    {
-        printf("malloc() fail\n");
-        return -1;
-    }
-    inst = (void *)memory;
-    SETUP_XDAIS(xdais[0], aec_output, 512);
-    SETUP_XDAIS(xdais[1], audio_fifo, 13 * 64 * 2);
-    SETUP_XDAIS(xdais[2], mfcc_fifo, 490);
-    SETUP_XDAIS(xdais[3], classes, 12);
-
-    ee_kws_f32(NODE_RESET, (void **)&inst, NULL, NULL);
-
-    for (int i = 0; i < NBUFFERS; ++i)
-    {
-        memcpy(aec_output, p_input[i], 512 /* 256 samples @ 2bytes@ */);
-        ee_kws_f32(NODE_RUN, (void **)&inst, xdais, &new_inference);
-
-        if (new_inference)
-        {
-            ++inferences;
-            p_check = p_expected[idx_check];
-            ++idx_check;
-
-            for (int j = 0; j < NCLASSES; ++j)
-            {
-                if (classes[j] != p_check[j])
-                {
-                    err = 1;
-                    printf("buffer[%d]class[%d]: Got %d, expected %d - FAIL\n",
-                           i,
-                           j,
-                           classes[j],
-                           p_check[j]);
-                }
-            }
-        }
-    }
-
-    if (inferences == 0)
-    {
-        err = 1;
-        printf("KWS did not perform any inferences\n");
-    }
-
-    if (inferences != NINFERS)
-    {
-        err = 1;
-        printf("KWS expected %d inferences but got %d\n", inferences, NINFERS);
-    }
-
-    if (err)
-    {
-        printf("KWS test failed\n");
-        return -1;
-    }
-
-    printf("KWS test passed\n");
-    return 0;
-}
+/**
+ * Copyright (C) 2024 SPEC Embedded Group
+ * Copyright (C) 2022 EEMBC
+ *
+ * All EEMBC Benchmark Software are products of EEMBC and are provided under the
+ * terms of the EEMBC Benchmark License Agreements. The EEMBC Benchmark Software
+ * are proprietary intellectual properties of EEMBC and its Members and is
+ * protected under all applicable laws, including all applicable copyright laws.
+ *
+ * If you received this EEMBC Benchmark Software without having a currently
+ * effective EEMBC Benchmark License Agreement, you must discontinue use.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "ee_types.h"
+#include "ee_audiomark.h"
+
+#define NBUFFERS 93
+#define NINFERS  73
+#define NSAMPLES 256
+#define NCLASSES 12
+
+/* Noise to signal ratio limits, as amplitude ratios: NSRMxxDB == 10^(-xx/20) */
+#define NSRM50DB 0.003162f /* -50 dB */
+#define NSRM40DB 0.01f /* -40 dB */
+#define NSRM35DB 0.017783f /* -35 dB */
+#define NSRM30DB 0.03162f /* -30 dB */
+
+#define USE_NSRM35DB 1 /* enables the NSRM35DB check in main(); see the USE_NSRMxxDB #ifdefs */
+/* Define DEBUG_EXACT_BITS to also fail on, and print, every class score that is not bit-exact. */
+#define MAX(a,b) (((a)>(b))?(a):(b)) /* NOTE: evaluates the selected argument twice */
+
+extern const int16_t p_input[NBUFFERS][NSAMPLES];
+extern const int8_t  p_expected[NINFERS][NCLASSES];
+
+// Used deep inside audiomark core
+char *spxGlobalHeapPtr;
+char *spxGlobalHeapEnd;
+
+int32_t ee_kws_f32(int32_t command,
+                   void  **pp_instance,
+                   void   *p_data,
+                   void   *p_params);
+
+static int16_t        aec_output[256];     // 5
+static int16_t        audio_fifo[13 * 64]; // 6
+static int8_t         mfcc_fifo[490];      // 7
+static int8_t         classes[12];         // 8
+static xdais_buffer_t xdais[4];
+
+/**
+ * KWS unit test: runs ee_kws_f32() over the NBUFFERS input frames and checks
+ * each reported inference against p_expected[] with a noise-to-signal ratio
+ * rather than a bit-exact match. Inferences whose computed and expected maxima
+ * are both negative count as noise. Returns 0 on pass, -1 on failure.
+ */
+int
+main(int argc, char *argv[])
+{
+    int           err           = 0;
+    int           new_inference = 0;
+    const int8_t *p_check       = NULL;
+    int           idx_check     = 0;
+    uint32_t      memreq        = 0;
+    uint32_t     *p_req         = &memreq;
+    void         *memory        = NULL;
+    void         *inst          = NULL;
+    int32_t       max_got       = 0; /* largest computed class score */
+    int32_t       max_exp       = 0; /* largest expected class score */
+    uint32_t      A             = 0;
+    uint32_t      B             = 0;
+    float         ratio         = 0.0f;
+    int           inferences    = 0;
+    int           i, j;
+
+    ee_kws_f32(NODE_MEMREQ, (void **)&p_req, NULL, NULL);
+
+    printf("KWS F32 MEMREQ = %lu bytes\n", (unsigned long)memreq);
+    memory = malloc(memreq);
+    if (!memory)
+    {
+        printf("malloc() fail\n");
+        return -1;
+    }
+    inst = (void *)memory;
+    SETUP_XDAIS(xdais[0], aec_output, 512);
+    SETUP_XDAIS(xdais[1], audio_fifo, 13 * 64 * 2);
+    SETUP_XDAIS(xdais[2], mfcc_fifo, 490);
+    SETUP_XDAIS(xdais[3], classes, 12);
+
+    ee_kws_f32(NODE_RESET, (void **)&inst, NULL, NULL);
+
+    for (i = 0; i < NBUFFERS; ++i)
+    {
+        memcpy(aec_output, p_input[i], 512 /* 256 samples @ 2 bytes */);
+        ee_kws_f32(NODE_RUN, (void **)&inst, xdais, &new_inference);
+
+        inferences += (new_inference != 0);
+        if (!new_inference || (idx_check >= NINFERS))
+        {
+            continue; /* nothing new to check, or no reference row left */
+        }
+        p_check = p_expected[idx_check++];
+        /* Track maxima as signed values so the "< 0" noise test can succeed. */
+        max_got = max_exp = -128; /* smallest int8_t score */
+        for (j = 0; j < NCLASSES; ++j)
+        {
+            max_got = MAX(max_got, (int32_t)classes[j]);
+            max_exp = MAX(max_exp, (int32_t)p_check[j]);
+        }
+        if ((max_got < 0) && (max_exp < 0))
+        {
+            continue; /* Both are less than 0, considered as noise and skip */
+        }
+
+        A = 0; /* sum of offset scores (signal) */
+        B = 0; /* sum of abs(errors) */
+        for (j = 0; j < NCLASSES; ++j)
+        {
+            /* 128 + score lies in [0, 255] for int8_t, so abs() is not needed */
+            A += (uint32_t)(128 + (int32_t)classes[j]);
+            B += (uint32_t)abs((int32_t)classes[j] - (int32_t)p_check[j]);
+#ifdef DEBUG_EXACT_BITS
+            if (classes[j] != p_check[j])
+            {
+                err = 1;
+                printf("buffer[%d]class[%d]: Got %d, expected %d - FAIL\n",
+                       i, j, classes[j], p_check[j]);
+            }
+#endif
+        }
+
+        /* Noise to signal ratio; a zero signal sum with errors is a full miss */
+        ratio = (A > 0U) ? ((float)B / (float)A) : ((B > 0U) ? 1.0f : 0.0f);
+#ifdef USE_NSRM50DB
+        if (ratio > NSRM50DB)
+        {
+            err = 1;
+            printf("KWS FAIL: Inference #%d exceeded -50 dB Noise-to-Signal ratio\n", i);
+        }
+#endif
+#ifdef USE_NSRM40DB
+        if (ratio > NSRM40DB)
+        {
+            err = 1;
+            printf("KWS FAIL: Inference #%d exceeded -40 dB Noise-to-Signal ratio\n", i);
+        }
+#endif
+#ifdef USE_NSRM35DB
+        if (ratio > NSRM35DB)
+        {
+            err = 1;
+            printf("KWS FAIL: Inference #%d exceeded -35 dB Noise-to-Signal ratio\n", i);
+        }
+#endif
+#ifdef USE_NSRM30DB
+        if (ratio > NSRM30DB)
+        {
+            err = 1;
+            printf("KWS FAIL: Inference #%d exceeded -30 dB Noise-to-Signal ratio\n", i);
+        }
+#endif
+    }
+    free(memory);
+
+    if (inferences == 0)
+    {
+        err = 1;
+        printf("KWS did not perform any inferences\n");
+    }
+
+    if (inferences != NINFERS)
+    {
+        err = 1;
+        printf("KWS expected %d inferences but got %d\n", NINFERS, inferences);
+    }
+
+    if (err)
+    {
+        printf("KWS test failed\n");
+        return -1;
+    }
+
+    printf("KWS test passed\n");
+    return 0;
+}

From 863a9bdd1b3f1f4bc3f0a991e10a5f90bee499be Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Tue, 11 Jun 2024 20:56:01 +0100
Subject: [PATCH 2/7] Update score calculation info, SNR in unit tests,
 clarification of libSpeexDSP data type, other improvements

---
 README.md | 991 +++++++++++++++++++++++++++---------------------------
 1 file changed, 503 insertions(+), 488 deletions(-)
 mode change 100644 => 100755 README.md

diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index b2b1c58..894ced8
--- a/README.md
+++ b/README.md
@@ -1,488 +1,503 @@
-# Introduction
-
-AudioMark™ is a benchmark which models a sophisticated, real-world audio 
-pipeline that uses a neural net for keyword spotting. EEMBC developed this 
-benchmark in response to the massive proliferation of products utilizing an 
-audio controlled Human-Machine Interface (HMI) that rely on such a pipeline. 
-This includes everything from personal assistants like Alexa, to white-box 
-appliances like washers, dryers, and refrigerators, to home entertainment 
-systems, and even cars that respond to voice commands.
-
-# Theory of operation
-
-The benchmark works by processing two microphone inputs listening to both a 
-speaker and reflected noise. A state-of-the-art adaptive beamformer determines 
-the direction of arrival of the primary speaker. This augmented signal is then 
-treated for echo cancellation and noise reduction. The cleaned signal is sent 
-to an MFCC feature extractor which feeds a neural net that spots one of ten 
-known keywords.
-
-The benchmark API facilitates hardware acceleration for key DSP and NN 
-functionality. The file `ee_api.h` describes the functions that the system 
-integrator must implement. The components were derived from several sources:
-
-* The beaformer and direction of arrival algorithms (BF+DOA) were written and tested by Arm and Infineon.
-* The acoustic echo canceller (AEC) and audio noise  reduction (ANR) elements are implemented by the SpeeX libspeexdsp library. These functions utilize the SpeeX API, which is a combination of macros and functions that perform fixed math operations, and an FFT wrapper for transformation.
-* The neural net was derived from the [Arm Model Zoo DS CNN KWS](https://github.com/ARM-software/ML-zoo/tree/master/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8).
-
-This flexibility to utilize whatever hardware is available means the benchmark 
-scales across a wide variety of MCUs and SoCs.
-
-All of the components are implemented in 32-bit floating point, with the 
-exception of the neural net, which is signed 8-bit integer. The data that flows 
-in between components is signed 16-bit integer.
-
-<img width="745" alt="image" src="https://user-images.githubusercontent.com/8249735/207705676-966fe230-8eac-4250-a468-437dc4ceebcd.png">
-
-
-
-# Building
-
-Typically this benchmark would be built within a product's specific 
-environment, using their own SDK, BSP and methodologies. Given the diversity of 
-build environments, EEMBC instead provides a simpler self-hosted `main.c` and 
-an implementation using Arm's *platform-agnostic* CMSIS functions to quickly 
-examine the functionality of the benchmark on an OS that supports `cmake`. 
-Ideally the target platform would use its own DSP and neural-net acceleration APIs.
-
-## Linux and macOS
-
-To build the self-hosted example, from the root directory type":
-
-~~~
-% mkdir build && cd build
-% cmake .. -DPORT_DIR=ports/arm
-% make test
-% make audiomark
-% ./audiomark
-~~~
-
-This will run the benchmark for at least 10 seconds and produce a score.
-
-## Windows
-
-x64 Native Tools Command Prompt for VS2019 `cmake` for windows will create a solution (`audiomark.sln`) file which 
-can be opened and compiled within Visual Studio.
-
-# Porting
-
-AudioMark has two port layers, one for the EEMBC code, and one for the SpeeX 
-DSP library. Both of these must be adjusted for the target platform, and both
-layers differ significantly. They are orthogonal, meaning the EEMBC layer is
-not above or below the SpeeX layer.
-
-The SpeeX layer contains multiple levels of abstraction:
-
-**The ISA level**: These are macros that perform simple math functions that
-could potentially be swapped for a CPU instruction, like `MULT16_32_P15`.
-These macros have generic definitions in the file `fixed_generic.h`.
-
-**The DSP library level**: These are typically more advanced functions that
-can be swapped out rather than optimizing at the ISA level. For example,
-the FFT is abstracted out entirely. See the `fftwrap.c` file for multiple
-examples of how an FFT can be instantiated.
-
-**The application level**: One step above the DSP level we have functions
-that aren't commonly found in libraries, such as `update_gains_critical_bands`.
-There are dozens of optimization options at this level which are described
-below.
-
-The EEMBC layer only focuses on the DSP-library level, with the exception
-of the neural-net intitialization and inference functions. As a result,
-there are far fewer function calls to consider with the EEMBC layer.
-
-**Word of warning**
-
-With all of these options, it is possible to accidentally (or intentionally)
-create a port that runs faster at the expense of quality, thus skewing
-comparisons. However, for a score to be considered valid, it must pass the
-unit tests. These tests permit at most 50 dB of SNR, a failure of a unit
-test means the optimizations have gone too far to be considered a fair
-comparison.
-
-## EEMBC port layer
-
-The EEMBC port layer is contained in two files: `th_types.h` and `th_api.c`.
-An unimplemented empty set of files is provided in `ports/barebones`, and a 
-reference can be found in `ports/arm_cmsis`.
-
-The `th_types.h` file defines the floating-point type, as well as 2D matrix 
-object type, and both real and complex FFT object types.
- 
-The `th_api.c` file contains the definition of the following functions:
-
-### Standard library overrides
-
-* th_malloc
-* th_free
-* th_memcpy
-* th_memmove
-* th_memset
-
-The memory allocation and free functions depend on the memory types of the
-platform, and how the system integrator wishes to place data in memory. See
-the section on the *Memory Model* in this document for more details. The top
-of the `th_api.c` file defines all of the static memory buffers required.
-
-### FFT functions
-
-* th_cfft_init_f32
-* th_cfft_f32
-* th_rfft_init_f32
-* th_rfft_f32
-
-These functions initialize an FFT type variable and perform complex and real
-FFTs.
-
-### Fundamental math functions
-
-* th_absmax_f32
-* th_int16_to_f32
-* th_f32_to_int16
-* th_cmplx_mult_cmplx_f32
-* th_cmplx_conj_f32
-* th_cmplx_dot_prod_f32
-* th_cmplx_mag_f32
-* th_add_f32
-* th_subtract_f32
-* th_multiply_f32
-* th_dot_prod_f32
-* th_offset_f32
-* th_vlog_f32
-* th_mat_vec_mult_f32
-
-Throughtout the EEMBC code, these functions perform the heavy-lifting.
-
-### Neural-net functions
-
-* th_nn_init
-* th_nn_classify
-
-The neural-net functions are a bit different than the DSP functions. Where a
-math function like `add` is straightforward, the initialization and invocation
-of the inference is extremely hardware dependent.
-
-To provide maximum flexibility, the neural net topology, layers, weights, and
-biases are all frozen and cannot be modified. Instead, the system integrator
-must construct the neural net from the definitions in `src/ee_nn.h` and
-`src/ee_nn_tables.c`.
-
-## LibSpeexDSP optimizations
-
-The AEC and ANR AudioMark components which are part of the LibSpeexDSP, can be 
-enhanced with architecture specific routines, taking advantage of the 
-underlying CPU capabilities. There are three types of optimizations available:
-
-1. FFT - FFT can already be replaced with optimized variants thanks to the 
-existing FFT wrapper (`lib/speexdsp/libspeexdsp/fftwrap.c`) and some parts of 
-the resampler already have SIMD support for ARM Neon and Intel SSE. Additional 
-FFT operations can be added here.
-
-2. Intrinsic primitives - These are simple macros that come with several 
-variants (here is the generic implementation of a multiplication: 
-[`MULT16_16`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/fixed_generic.h#L77)
-and may be replaced as needed. SpeeX comes with several compiler-time selected 
-options stored in the files `fixed_*.c`.
-
-3. Override path - This method uses define macros to override individual 
-functions and is described below.
-
-For the rest of the software, and given the monolithic nature of the 
-LibSpeexDSP structure, customization of intensive parts could be achieved by 
-defining compiler conditions that would override some of the inner loops with 
-optimized routines that could potentially be vectorized or 
-hardware-accelerated. This is similar to what was implemented for the TriMedia 
-porting (referring the `lib/speexdsp/libspeexdsp/tmv` folder)
-
-By default, none of these override compiler directives are defined, causing the 
-LibSpeexDSP library to act with its vanilla behaviour. However, the system 
-integrator is free to define all or part of these compiler conditional options. 
-The decision on which of these should be set active is largely architecture and 
-compiler dependant. Simple loop structures are typically properly handled by 
-recent compilers which support vectorization for SIMD targets, but more complex 
-ones could require access to an external library like CMSIS DSP, hand 
-optimization through C with intrinsic or even assembly to reach peak 
-performance.
-
-As a first example, the AEC power_spectrum routine, which is essentially 
-computing the squared magnitude of a complex signal, could use the CMSIS DSP 
-`arm_cmplx_mag_squared_f32` function and for this defining the 
-`OVERRIDE_MDF_POWER_SPECTRUM` would deactivate original definition and use the 
-optimized variant that will be placed in the 
-lib/speexdsp/libspeexdsp/mdf_opt_helium.c and defined the following way:
-
-Here is the generic example in [`mdf_opt_generic.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/mdf_opt_generic.c#L84-L94):
-
-```C
-static void power_spectrum(const spx_word16_t * X, spx_word32_t * ps, int N)
-{
-    int             i, j;
-    ps[0] = MULT16_16(X[0], X[0]);
-    for (i = 1, j = 1; i < N - 1; i += 2, j++) {
-        ps[j] = MULT16_16(X[i], X[i]) + MULT16_16(X[i + 1], X[i + 1]);
-    }
-    ps[j] = MULT16_16(X[i], X[i]);
-}
-```
-
-While it is possible to gain performance improvement by overriding [`MULT16_16`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/fixed_generic.h#L77), a better optimization could be
-attained by overriding the function. Here is an example from [`mdf_opt_helium.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_helium.c#L172-L180):
-
-```C
-void power_spectrum(const spx_word16_t * X, spx_word32_t * ps, int N)
-{
-    ps[0] = MULT16_16(X[0], X[0]);
-    arm_cmplx_mag_squared_f32(&X[1], ps + 1, N - 1);
-}
-```
-
-For this one, most of the recent compilers will be able to vectorize the native implementation. Visual inspection
-of the generated assembly would determine whether it is worth having a specific optimized variant.
-
-A second example would be the ANR final stage overlap-add with 16-bit integer conversion. For this one, there is no native library equivalent.
-A customized variant with SIMD C intrinsic will allow to take advantage of SIMD if available.
-
-As before, here is the generic implementation from [`preprocess_opt_generic.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c#L64-L73):
-
-```C
-static void vect_ola(const spx_word16_t * pSrcA, const spx_word16_t * pSrcB, spx_int16_t * pDst, uint32_t blockSize)
-{
-    int             i;
-    
-    for (i = 0; i < blockSize; i++)
-        pDst[i] = WORD2INT(ADD32(EXTEND32(pSrcA[i]), EXTEND32(pSrcB[i])));
-}
-```
-
-And here would be the ARM with Helium intrinsics version proposal found in [`preprocess_opt_helium.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_helium.c#L110-L127):
-
-```C
-void vect_ola(const spx_word16_t * pSrcA, const spx_word16_t * pSrcB, spx_int16_t * pDst, uint32_t blockSize)
-{
-    int16x8_t converted = vdupq_n_s16(0);
-    
-    for (int i = 0; i < blockSize; i += 4) {
-        float32x4_t vtmp = vaddq(vld1q(pSrcA), vld1q(pSrcB));
-        /* rounding, saturation and 16-bit narrowing with saturation */
-        converted = vqmovnbq(vuninitializedq(converted), vcvtaq_s32_f32(vtmp));
-        vstrhq_s32(pDst, converted);
-        pDst += 4;         
-        pSrcA += 4;         
-        pSrcB += 4;
-    }
-}
-```
-
-All override compiler defines and associated C routines prototypes will be 
-listed below. These are divided into 3 categories, associated to:
-
-* Echo canceller core (`lib/speexdsp/libspeexdsp/mdf.c`),
-* Noise suppressor core (`lib/speexdsp/libspeexdsp/preprocess.c`)
-* Associated filter banks, which are noise suppressor subparts for 
-psychoacoustic analysis (`lib/speexdsp/libspeexdsp/filterbank.c`) 
-
-Code behaviour for these different C routines is respectively provided in:
-
-* `lib/speexdsp/libspeexdsp/mfd_opt_generic.c`
-* `lib/speexdsp/libspeexdsp/preprocess_opt_generic.c`
-* `lib/speexdsp/libspeexdsp/filterbank_opt_generic.c`
-
-These are provided as models, replicated as-is from the core LibSpeexDSP 
-software parts with function embedding and must serve as reference for 
-optimized software equivalent.
-
-It is expected to pass these compiler directives inside build systems like 
-cmake or configuration header as with the standard LibSpeexDSP `config.h`.
-
-An example of use can be found in the ARM CMSIS build YML files 
-(`platform/cmsis/speex.clayer.yml`) where most of these override defines have 
-been activated.
-
-```yaml
-      misc:
-        - C:
-          - -DFLOATING_POINT
-          - -DEXPORT=
-          - -DOS_SUPPORT_CUSTOM
-          # speex boosted routines
-          # FilterBank
-          - -DOVERRIDE_FB_COMPUTE_BANK32
-          - -DOVERRIDE_FB_COMPUTE_PSD16
-          # ANR
-          - -DOVERRIDE_ANR_VEC_MUL
-          - -DOVERRIDE_ANR_UPDATE_NOISE_ESTIMATE
-          ...
-```
-
-### Complete list of overrides, functions, and behavior.
-
-| Override | Function | Beahvior |
-| -------- | -------- | ------------------------ |
-| OVERRIDE_MDF_ADJUST_PROP | mdf_adjust_prop | Computes filter adaptation rate, proportional to inverse of weight filter energy. |
-| OVERRIDE_MDF_CONVERG_LEARN_RATE_CALC | mdf_non_adapt_learning_rate_calc | Part of the process of the computing the adaption rate when filter is not yet adapted enough. This routine divides the adaptation rate by Far End power over the whole subframe. |
-| OVERRIDE_MDF_DC_NOTCH | filter_dc_notch16 | Notch filter with strided spx_int16_t (int16_t) type input and spx_word16_t (floating point) output. |
-| OVERRIDE_MDF_DEEMPH | mdf_deemph | Compute error signal, check input saturation and convert / saturate strided output to spx_int16_t (int16_t). |
-| OVERRIDE_MDF_FILTERED_SPEC_AD_XCORR | void filtered_spectra_cross_corr | Compute filtered spectra and (cross-)correlations. |
-| OVERRIDE_MDF_INNER_PROD | mdf_inner_prod | Dot-product. Was already provided as a stand-alone function in the original software. |
-| OVERRIDE_MDF_NORM_LEARN_RATE_CALC | mdf_nominal_learning_rate_calc | Normal learning rate calculation once we're past the minimal adaptation phase. |
-| OVERRIDE_MDF_POWER_SPECTRUM | power_spectrum | Compute power spectrum of a complex vector. |
-| OVERRIDE_MDF_POWER_SPECTRUM_ACCUM | power_spectrum_accum | Same as power_spectrum above, with extra accumulation. |
-| OVERRIDE_MDF_PREEMPH_FLT | mdf_preemph | Copy spx_word16_(int16_t) input data to buffer and apply pre-emphasis filter. |
-| OVERRIDE_MDF_SMOOTHED_ADD | smoothed_add | Blend error and echo residual to apply a smooth transition to avoid introducing blocking artifacts. |
-| OVERRIDE_MDF_SMOOTH_FE_NRG | smooth_fe_nrg | Smooth far end energy estimate over time. |
-| OVERRIDE_MDF_SPECTRAL_MUL_ACCUM | spectral_mul_accum | Compute cross-power spectrum of a complex vectors and accumulate. Only relevant for fixed-point as mixes spx_word16_t and spx_word32_t. Floating point version, used in Audiomark, has only plain floating point  versions and does not distinguish with spectral_mul_accum16. |
-| OVERRIDE_MDF_SPECTRAL_MUL_ACCUM16 | spectral_mul_accum16 | Same as spectral_mul_accum above but plain spx_word16_t format. Floating point version, used in Audiomark, has only plain floating point versions and does not distinguish with spectral_mul_accum. |
-| OVERRIDE_MDF_STRIDED_PREEMPH_FLT | mdf_preemph_with_stride_int | Strided spx_int16_t (int16_t) pre-emphasis filter with saturation check. |
-| OVERRIDE_MDF_VEC_ADD | vect_add | spx_word16_t (Floating point) vector addition. |
-| OVERRIDE_MDF_VEC_CLEAR | vect_clear | spx_word16_t (Floating point) vector clear. |
-| OVERRIDE_MDF_VEC_COPY | vect_copy | spx_word16_t (Floating point) vector copy. |
-| OVERRIDE_MDF_VEC_MULT | vect_mult | spx_word16_t (Floating point) vector multiplication for windowing. |
-| OVERRIDE_MDF_VEC_SCALE | vect_scale | spx_word16_t (Floating point) vector scaling. |
-| OVERRIDE_MDF_VEC_SUB | vect_sub | spx_word16_t (Floating point) vector subtraction. |
-| OVERRIDE_MDF_VEC_SUB_INT16 | vect_sub16 | spx_int16_t (16-bit signed integer point) vector subtraction for filtered echo computation, difference of AEC input and output subframes. |
-| OVERRIDE_MDF_WEIGHT_SPECT_MUL_CONJ | weighted_spectral_mul_conj | Compute weighted cross-power spectrum of a complex vector with conjugate. |
-| OVERRIDE_ANR_APOSTERIORI_SNR | aposteriori_snr | Compute A-posteriori / A-priori SNRs. |
-| OVERRIDE_ANR_APPLY_SPEC_GAIN | apply_spectral_gain | Apply computed spectral gain. |
-| OVERRIDE_ANR_COMPUTE_GAIN_FLOOR | compute_gain_floor | Compute the gain floor based on different floors for the background noise and residual echo. |
-| OVERRIDE_ANR_HYPERGEOM_GAIN | hypergeom_gain | compute hypergeometric function. |
-| OVERRIDE_ANR_OLA | vect_ola | spx_word16_t vector overlap and add. |
-| OVERRIDE_ANR_POWER_SPECTRUM | power_spectrum | Complex magnitude squared of a spx_word16_t (floating-point) vector. |
-| OVERRIDE_ANR_QCURVE | qcurve | Compute 1 / (1 + 0.15 / (SNR_SCALING_1 * x)) |
-| OVERRIDE_ANR_UPDATE_GAINS_CRITICAL_BANDS | update_gains_critical_bands | Update gains in critical bands (MEL scale). |
-| OVERRIDE_ANR_UPDATE_GAINS_LINEAR | update_gains_linear | Update gains in linear spectral bands. |
-| OVERRIDE_ANR_UPDATE_NOISE_ESTIMATE | update_noise_estimate | Update noise estimates. |
-| OVERRIDE_ANR_UPDATE_NOISE_PROB | update_noise_prob | Update noise probabilities and smoothed power spectrum. |
-| OVERRIDE_ANR_UPDATE_ZETA | preprocess_update_zeta | Update Smoothed a priori SNR. |
-| OVERRIDE_ANR_VEC_CONV_FROM_INT16 | vect_conv_from_int16 | Convert spx_int16_t (16-bit signed integer) vector to spx_word16_t (floating-point). |
-| OVERRIDE_ANR_VEC_COPY | vect_copy* | Generic vector copy. |
-| OVERRIDE_ANR_VEC_MUL | vect_mult* | `spx_word16_t` vector multiplication for windowing. |
-| OVERRIDE_FB_COMPUTE_BANK32 | filterbank_compute_bank32 | Convert linear power spectrum in MEL perceptual scale. |
-| OVERRIDE_FB_COMPUTE_PSD16 | filterbank_compute_psd16 | Compute the linear power spectral density from MEL perceptual scale. |
-
-\* It can be noted that ANR vector multiplications and vector copy routines are similar to AEC ones and can be shared.
-
-# Memory model
-
-There are four types of memory required for the benchmark: input audio buffers,
-pre-defined inter-component buffers, constant tables, and component-specific
-scractch memory requests.
-
-## Input audio buffers
-
-Three channels of input audio are provided: left, right, and noise. There are
-roughly 1.69 seconds of audio in these buffers. A fourth buffer, `for_asr` is
-used for propagate the AEC output.
-
-## Inter-component buffers
-
-Each component connects to the other components or inputs via one or more
-buffers. These statically allocated buffers' storage is defined in `th_api.c`
-to allow the system integrator to better control placement.
-
-## Table constants
-
-All files with the name `*_tables.c` define arrays that are referenced via
-extern from their respective components. These array variables have been
-stored in their own source files to facilitate linker placement optimization.
-
-The adaptive beamformer, MFCC feature extractor, and neural net all have
-several large tables of constants.
-
-## Dynamic allocation for scratch memory
-
-Each component needs a certain amount of scratch memory. The components are
-written in such a way that they are first queried to indicate how much
-memory they require, and then that is allocated by the framework and provided
-via buffers during reset and run. This has been abstracted down to the 
-`th_malloc` function. The parameters to this function are the number of
-required bytes and the component that is requesting it. The system integrator
-can use the default STDLIB `malloc` function, or allocate their own memory
-buffers to target different types of memory. **The memory is never freed so
-there is no need to install a sophisticated memory manager.** Simply assigning
-subsequent address pointers is sufficient (provided there is enough memory).
-
-Both the LibSpeexDSP and EEMBC-provided components utilize this dynamic
-allocation pattern.
-
-# Coding conventions
-
-## Formatting
-
-EEMBC formats according to [Barr-C Embedded Standards](https://barrgroup.com/sites/default/files/barr_c_coding_standard_2018.pdf). The `.clang-format` file
-in the root directory observes this. This file can be used within VSCode or
-MSVC, however it isn't clear if this behaves the same as `clang-format`
-version 14 (which aligns pointer stars differently).
-
-## Functions and filenames
-
-Traditionally, all functions and files start with either `ee_` or `th_`, where
-the former notation indicates "thou shall not change" and the latter must be
-changed in order to port the code (i.e., the Test Harness, hence `th_`).
-
-Since there is so much code from Xiph that we are including, we will not change
-all of their code but might have to change some (like FFT wrappers). It may
-require a Run Rule to avoid this code being altered. Ideally every function
-should fall into a simple `th_api` folder or collection of files so that it is
-obvious what needs to be ported. Currently the `components/eembc` folder
-illustrates this by separating all of the Arm-specific code into `th_api.c`.
-
-# Run rules
-
-1. The minimum resolution of the system timer used for performance measurement shall be one (1) kHz.
-
-2. The minimum runtime shall be 10 seconds iterating on the `audiomark_run()` function. At least 10 iterations must complete in this 10 seconds, otherwise the runtime shall be whatever achieves 10 iterations. 10 seconds of runtime with a 1 ms resolution is effectively 0.01% measurement resolution.
-
-3. Only functions and files starting with `th_*` may be altered. In the case of the `th_api`, this is required. The one exception to this is the SpeeX DSP library code, where the user may modify the `wrapfft.c` to install the optimal FFT, change the override macros and provide their own associated functionality, or modify the primitive DSP math macros. All of the EEMBC provided `ee_` functions ultimately perform DSP computation through a subset of functions declared in the `ee_api.h` header, and implemented however the system integrator choses (the example puts all of the implementation reference code in the `th_api.c` file).
-
-4. For a score to be considered valid, it must pass the AEC, ANR, ABF, and KWS regression tests found in the `tests/` directory. The AEC, ANR, and ABF tests utilize a SNR ratio check of -50 dB over 62.5 ms frames of data. The KWS expects the softmax output to match the Top-1 prediction for each inference, and not the actual probability. This allows for flexibility in optimizing the API functions which may not be bit-exact, but still achieve roughly the same fidelity.
-
-5. All processing must be carried out on the platform locally and not sent to the cloud. The definition of a device includes single chip silicon, and multi-die modules, and multi-chip platforms. External memories are allowed.
-
-6. The AudioMark score shall be computed as "iterations per second * 1000 * 1.5". AudioMark/MHz is simply AudioMark divided by the highest core frequency in MHz. For example, if the benchmark is running on Core A at 100 Mhz and Core A uses a DSP peripheral running at 300 MHz, the highest frequency is used in the computation; in this case, 300 MHz. See footnote #1 below.
-
-
-7. The benchmark score shall be obtained with the serial computation of the following components--ABF, AEC, ANR, KWS--with execution proceeding from one component to another, on completion. The benchmark shall not be altered or implemented in any way that causes any of the components to execute in parallel.
-
-8. Running multiple processes of AudioMark on a platform is allowed. For example, a dual core product could run one instance of AudioMark (`time_audiomark_run()`) on each core  For N cores running the AudioMark concurrently:
-
-```
-    Define: AM(n) is defined in Rule #6. measured per core n
-    Define: f(n) is the clock frequency of core n
-    AudioMark (AM) = Sum {AM(n): n = 1 to N}
-    AudioMark/MHz (AM/MHz) = AM divided by max {f(n): n = 1 to N}
-```
-
-**Footnote #1: Explanation of scoring equation**
-
-First, the 1000 factor is introduced to scale the score into a preferred integer range, this is a common EEMBC technique to avoid comparing small fractional numbers. Second, notice that the benchmark assumes a pipeline operating on 16 kHz audio input, and a platform that is operating efficiently would score 1.0 (with no scalar): it is running exactly at the speed needed, no faster, no slower. A platform with half the performance would measure 0.5 iterations per second. However, the benchmark input dataset is actually 24000 samples worth of data not 16000, so if one iteration completes in one second, the benchmark has in reality performed better than the 16 kHz design goal. To adjust for this, the score is multiplied by 1.5. (Note: The benchmark could reduce the number of samples to one second (or 16k samples), however, the lead-in silence is needed to stabilize the ANR, and the keyword utterance spills over the one-second mark of the sample.)
-
-# Submitting scores
-
-1. A score must be submitted to the website before it can be used in any external publication such as: academic journals, technical papers, and marketing assets. An unsubmitted score may only be discussed internally or with a 3rd party under NDA, but not presented to the public in any way. Note: the score may be submitted but not go live (i.e., visible on the web) until a certain date agreed to between the submitter and EEMBC. This is to account for product launch schedules.
-
-2. A submitted score shall be from hardware that is available for purchase to any member of the general public. Scores for hardware that is yet to be announced must be marked "preliminary". This score may be superseded by a new score on the released product, or cleared by request to EEMBC after the product is launched.
-
-3. A score collected from simulation cannot be submitted. The measured score must come from actual silicon. This includes CPU, MCU, MPU, SoC, and FPGA prototype.
-
-# Credits
-
-This benchmark would not have been possible without the commitment and contributions of the working group members, and the assistance from various domain experts, including (sorted by given name):
-
-* Ashutosh Pandey, Infineon (Technical Lead)
-* Dmitry Utyansky, Synopsys
-* Fabien Klein, Arm
-* Felix Johnny Thomasmathibalan, Arm
-* Gary Jacobson: Renesas
-* Jim Ryan, onsemi
-* Joseph Yiu, Arm
-* Kaiping Lee, Infineon
-* Laurent Le Faucheur, Arm
-* Mark Wallis, STMicroelectronics
-* Nagendra (GD) Gulur Dwarakanath, Texas Instruments
-* Peter Torelli, EEMBC
-* Rita Chattopadhyay, Intel
-* Ruud Derwig, Synopsys
-
-# Copyright and license
-
-Copyright (C) EEMBC, All rights reserved. Please refer to LICENSE.md.
+# Introduction
+
+AudioMark™ is a benchmark which models a sophisticated, real-world audio 
+pipeline that uses a neural net for keyword spotting. EEMBC developed this 
+benchmark in response to the massive proliferation of products utilizing an 
+audio controlled Human-Machine Interface (HMI) that rely on such a pipeline. 
+This includes everything from personal assistants like Alexa, to white-box 
+appliances like washers, dryers, and refrigerators, to home entertainment 
+systems, and even cars that respond to voice commands.
+
+# Theory of operation
+
+The benchmark works by processing two microphone inputs listening to both a 
+speaker and reflected noise. A state-of-the-art adaptive beamformer determines 
+the direction of arrival of the primary speaker. This augmented signal is then 
+treated for echo cancellation and noise reduction. The cleaned signal is sent 
+to an MFCC feature extractor which feeds a neural net that spots one of ten 
+known keywords.
+
+The benchmark API facilitates hardware acceleration for key DSP and NN 
+functionality. The file `ee_api.h` describes the functions that the system 
+integrator must implement. The components were derived from several sources:
+
+* The beamformer and direction of arrival algorithms (BF+DOA) were written and tested by Arm and Infineon.
+* The acoustic echo canceller (AEC) and audio noise  reduction (ANR) elements are implemented by the SpeeX libspeexdsp library. These functions utilize the SpeeX API, which is a combination of macros and functions that perform floating-point/fixed math operations, and an FFT wrapper for transformation. In AudioMark, only the single precision floating-point version of the library is used.
+* The neural net was derived from the [Arm Model Zoo DS CNN KWS](https://github.com/ARM-software/ML-zoo/tree/master/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8).
+
+This flexibility to utilize whatever hardware is available means the benchmark 
+scales across a wide variety of MCUs and SoCs.
+
+When possible, the components are implemented in 32-bit floating point, with the 
+exception of the neural net, which is signed 8-bit integer. The data that flows 
+in between components is signed 16-bit integer.
+
+<img width="745" alt="image" src="https://user-images.githubusercontent.com/8249735/207705676-966fe230-8eac-4250-a468-437dc4ceebcd.png">
+
+
+
+# Building
+
+Typically this benchmark would be built within a product's specific 
+environment, using their own SDK, BSP and methodologies. Given the diversity of 
+build environments, EEMBC instead provides a simpler self-hosted `main.c` and 
+an implementation using Arm's *platform-agnostic* CMSIS functions to quickly 
+examine the functionality of the benchmark on an OS that supports `cmake`. 
+Ideally the target platform would use its own DSP and neural-net acceleration APIs.
+
+## Linux and macOS
+
+To build the self-hosted example, from the root directory type:
+
+~~~
+% mkdir build && cd build
+% cmake .. -DPORT_DIR=ports/arm
+% make test
+% make audiomark
+% ./audiomark
+~~~
+
+This will run the benchmark for at least 10 seconds and produce a score.
+
+## Windows
+
+In the x64 Native Tools Command Prompt for VS2019, `cmake` for Windows will create a solution file (`audiomark.sln`) which 
+can be opened and compiled within Visual Studio.
+
+# Porting
+
+AudioMark has two port layers, one for the EEMBC code, and one for the SpeeX 
+DSP library. Both of these must be adjusted for the target platform, and both
+layers differ significantly. They are orthogonal, meaning the EEMBC layer is
+not above or below the SpeeX layer.
+
+The SpeeX layer contains multiple levels of abstraction:
+
+**The ISA level**: These are macros that perform simple math functions that
+could potentially be swapped for a CPU instruction, like `MULT16_32_P15`.
+These macros have generic definitions in the file `fixed_generic.h`.
+
+**The DSP library level**: These are typically more advanced functions that
+can be swapped out rather than optimizing at the ISA level. For example,
+the FFT is abstracted out entirely. See the `fftwrap.c` file for multiple
+examples of how an FFT can be instantiated.
+
+**The application level**: One step above the DSP level we have functions
+that aren't commonly found in libraries, such as `update_gains_critical_bands`.
+There are dozens of optimization options at this level which are described
+below.
+
+The EEMBC layer only focuses on the DSP-library level, with the exception
+of the neural-net initialization and inference functions. As a result,
+there are far fewer function calls to consider with the EEMBC layer.
+
+**Word of warning**
+
+With all of these options, it is possible to accidentally (or intentionally)
+create a port that runs faster at the expense of quality, thus skewing
+comparisons. However, for a score to be considered valid, it must pass the 
+unit tests. There are five unit tests in total:
+
+- Digital Signal Processing (DSP) tests
+  - test_abf (tests/test_abf_f32.c)
+  - test_anr (tests/test_anr_f32.c)
+  - test_aec (tests/test_aec_f32.c)
+  - test_mfcc (tests/test_mfcc_f32.c)
+- Neural-network (NN) test
+  - test_kws (tests/test_kws.c)  
+
+The DSP unit tests require at least 50 dB of SNR (Signal-to-Noise 
+ratio); a failure of a unit test means the optimizations have gone too far 
+to be considered a fair comparison. The KWS unit test uses a 35 dB ratio for
+the comparison, but only when valid data is present.
+
+Note: The actual test code uses Noise-to-Signal ratio instead of 
+Signal-to-Noise. SNR has the disadvantage that its value becomes infinite when the results 
+are a bit-exact match to the reference data (i.e. the noise level is 0).
+
+## EEMBC port layer
+
+The EEMBC port layer is contained in two files: `th_types.h` and `th_api.c`.
+An unimplemented empty set of files is provided in `ports/barebones`, and a 
+reference can be found in `ports/arm_cmsis`.
+
+The `th_types.h` file defines the floating-point type, as well as 2D matrix 
+object type, and both real and complex FFT object types.
+ 
+The `th_api.c` file contains the definition of the following functions:
+
+### Standard library overrides
+
+* th_malloc
+* th_free
+* th_memcpy
+* th_memmove
+* th_memset
+
+The memory allocation and free functions depend on the memory types of the
+platform, and how the system integrator wishes to place data in memory. See
+the section on the *Memory Model* in this document for more details. The top
+of the `th_api.c` file defines all of the static memory buffers required.
+
+### FFT functions
+
+* th_cfft_init_f32
+* th_cfft_f32
+* th_rfft_init_f32
+* th_rfft_f32
+
+These functions initialize an FFT type variable and perform complex and real
+FFTs.
+
+### Fundamental math functions
+
+* th_absmax_f32
+* th_int16_to_f32
+* th_f32_to_int16
+* th_cmplx_mult_cmplx_f32
+* th_cmplx_conj_f32
+* th_cmplx_dot_prod_f32
+* th_cmplx_mag_f32
+* th_add_f32
+* th_subtract_f32
+* th_multiply_f32
+* th_dot_prod_f32
+* th_offset_f32
+* th_vlog_f32
+* th_mat_vec_mult_f32
+
+Throughout the EEMBC code, these functions perform the heavy lifting.
+
+### Neural-net functions
+
+* th_nn_init
+* th_nn_classify
+
+The neural-net functions are a bit different than the DSP functions. Where a
+math function like `add` is straightforward, the initialization and invocation
+of the inference is extremely hardware dependent.
+
+To provide maximum flexibility, the neural net topology, layers, weights, and
+biases are all frozen and cannot be modified. Instead, the system integrator
+must construct the neural net from the definitions in `src/ee_nn.h` and
+`src/ee_nn_tables.c`.
+
+## LibSpeexDSP optimizations
+
+The AEC and ANR AudioMark components which are part of the LibSpeexDSP, can be 
+enhanced with architecture specific routines, taking advantage of the 
+underlying CPU capabilities. There are three types of optimizations available:
+
+1. FFT - FFT can already be replaced with optimized variants thanks to the 
+existing FFT wrapper (`lib/speexdsp/libspeexdsp/fftwrap.c`) and some parts of 
+the resampler already have SIMD support for ARM Neon and Intel SSE. Additional 
+FFT operations can be added here.
+
+2. Intrinsic primitives - These are simple macros that come with several 
+variants (here is the generic implementation of a multiplication: 
+[`MULT16_16`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/fixed_generic.h#L77))
+and may be replaced as needed. SpeeX comes with several compile-time selected 
+options stored in the files `fixed_*.h`.
+
+3. Override path - This method uses define macros to override individual 
+functions and is described below.
+
+For the rest of the software, and given the monolithic nature of the 
+LibSpeexDSP structure, customization of intensive parts could be achieved by 
+defining compiler conditions that would override some of the inner loops with 
+optimized routines that could potentially be vectorized or 
+hardware-accelerated. This is similar to what was implemented for the TriMedia 
+porting (refer to the `lib/speexdsp/libspeexdsp/tmv` folder).
+
+By default, none of these override compiler directives are defined, causing the 
+LibSpeexDSP library to act with its vanilla behaviour. However, the system 
+integrator is free to define all or part of these compiler conditional options. 
+The decision on which of these should be set active is largely architecture and 
+compiler dependent. Simple loop structures are typically properly handled by 
+recent compilers which support vectorization for SIMD targets, but more complex 
+ones could require access to an external library like CMSIS DSP, hand 
+optimization through C with intrinsic or even assembly to reach peak 
+performance.
+
+As a first example, the AEC power_spectrum routine, which is essentially 
+computing the squared magnitude of a complex signal, could use the CMSIS DSP 
+`arm_cmplx_mag_squared_f32` function and for this defining the 
+`OVERRIDE_MDF_POWER_SPECTRUM` would deactivate original definition and use the 
+optimized variant that will be placed in the 
+lib/speexdsp/libspeexdsp/mdf_opt_helium.c and defined the following way:
+
+Here is the generic example in [`mdf_opt_generic.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/mdf_opt_generic.c#L84-L94):
+
+```C
+static void power_spectrum(const spx_word16_t * X, spx_word32_t * ps, int N)
+{
+    int             i, j;
+    ps[0] = MULT16_16(X[0], X[0]);
+    for (i = 1, j = 1; i < N - 1; i += 2, j++) {
+        ps[j] = MULT16_16(X[i], X[i]) + MULT16_16(X[i + 1], X[i + 1]);
+    }
+    ps[j] = MULT16_16(X[i], X[i]);
+}
+```
+
+While it is possible to gain performance improvement by overriding [`MULT16_16`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/fixed_generic.h#L77), a better optimization could be
+attained by overriding the function. Here is an example from [`mdf_opt_helium.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_helium.c#L172-L180):
+
+```C
+void power_spectrum(const spx_word16_t * X, spx_word32_t * ps, int N)
+{
+    ps[0] = MULT16_16(X[0], X[0]);
+    arm_cmplx_mag_squared_f32(&X[1], ps + 1, N - 1);
+}
+```
+
+For this one, most of the recent compilers will be able to vectorize the native implementation. Visual inspection
+of the generated assembly would determine whether it is worth having a specific optimized variant.
+
+A second example would be the ANR final stage overlap-add with 16-bit integer conversion. For this one, there is no native library equivalent.
+A customized variant with SIMD C intrinsic will allow to take advantage of SIMD if available.
+
+As before, here is the generic implementation from [`preprocess_opt_generic.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c#L64-L73):
+
+```C
+static void vect_ola(const spx_word16_t * pSrcA, const spx_word16_t * pSrcB, spx_int16_t * pDst, uint32_t blockSize)
+{
+    int             i;
+    
+    for (i = 0; i < blockSize; i++)
+        pDst[i] = WORD2INT(ADD32(EXTEND32(pSrcA[i]), EXTEND32(pSrcB[i])));
+}
+```
+
+And here would be the ARM with Helium intrinsics version proposal found in [`preprocess_opt_helium.c`](https://github.com/eembc/audiomark-dev/blob/e0fd95e10d5ce6fd724b525fac998327b4f0dd8f/lib/speexdsp/libspeexdsp/preprocess_opt_helium.c#L110-L127):
+
+```C
+void vect_ola(const spx_word16_t * pSrcA, const spx_word16_t * pSrcB, spx_int16_t * pDst, uint32_t blockSize)
+{
+    int16x8_t converted = vdupq_n_s16(0);
+    
+    for (int i = 0; i < blockSize; i += 4) {
+        float32x4_t vtmp = vaddq(vld1q(pSrcA), vld1q(pSrcB));
+        /* rounding, saturation and 16-bit narrowing with saturation */
+        converted = vqmovnbq(vuninitializedq(converted), vcvtaq_s32_f32(vtmp));
+        vstrhq_s32(pDst, converted);
+        pDst += 4;         
+        pSrcA += 4;         
+        pSrcB += 4;
+    }
+}
+```
+
+All override compiler defines and associated C routines prototypes will be 
+listed below. These are divided into 3 categories, associated to:
+
+* Echo canceller core (`lib/speexdsp/libspeexdsp/mdf.c`),
+* Noise suppressor core (`lib/speexdsp/libspeexdsp/preprocess.c`)
+* Associated filter banks, which are noise suppressor subparts for 
+psychoacoustic analysis (`lib/speexdsp/libspeexdsp/filterbank.c`) 
+
+Code behaviour for these different C routines is respectively provided in:
+
+* `lib/speexdsp/libspeexdsp/mdf_opt_generic.c`
+* `lib/speexdsp/libspeexdsp/preprocess_opt_generic.c`
+* `lib/speexdsp/libspeexdsp/filterbank_opt_generic.c`
+
+These are provided as models, replicated as-is from the core LibSpeexDSP 
+software parts with function embedding and must serve as reference for 
+optimized software equivalent.
+
+It is expected to pass these compiler directives inside build systems like 
+cmake or configuration header as with the standard LibSpeexDSP `config.h`.
+
+An example of use can be found in the ARM CMSIS build YML files 
+(`platform/cmsis/speex.clayer.yml`) where most of these override defines have 
+been activated.
+
+```yaml
+      misc:
+        - C:
+          - -DFLOATING_POINT
+          - -DEXPORT=
+          - -DOS_SUPPORT_CUSTOM
+          # speex boosted routines
+          # FilterBank
+          - -DOVERRIDE_FB_COMPUTE_BANK32
+          - -DOVERRIDE_FB_COMPUTE_PSD16
+          # ANR
+          - -DOVERRIDE_ANR_VEC_MUL
+          - -DOVERRIDE_ANR_UPDATE_NOISE_ESTIMATE
+          ...
+```
+
+### Complete list of overrides, functions, and behavior.
+
+| Override | Function | Behavior |
+| -------- | -------- | ------------------------ |
+| OVERRIDE_MDF_ADJUST_PROP | mdf_adjust_prop | Computes filter adaptation rate, proportional to inverse of weight filter energy. |
+| OVERRIDE_MDF_CONVERG_LEARN_RATE_CALC | mdf_non_adapt_learning_rate_calc | Part of the process of computing the adaptation rate when the filter is not yet adapted enough. This routine divides the adaptation rate by Far End power over the whole subframe. |
+| OVERRIDE_MDF_DC_NOTCH | filter_dc_notch16 | Notch filter with strided spx_int16_t (int16_t) type input and spx_word16_t (floating point) output. |
+| OVERRIDE_MDF_DEEMPH | mdf_deemph | Compute error signal, check input saturation and convert / saturate strided output to spx_int16_t (int16_t). |
+| OVERRIDE_MDF_FILTERED_SPEC_AD_XCORR | filtered_spectra_cross_corr | Compute filtered spectra and (cross-)correlations. |
+| OVERRIDE_MDF_INNER_PROD | mdf_inner_prod | Dot-product. Was already provided as a stand-alone function in the original software. |
+| OVERRIDE_MDF_NORM_LEARN_RATE_CALC | mdf_nominal_learning_rate_calc | Normal learning rate calculation once we're past the minimal adaptation phase. |
+| OVERRIDE_MDF_POWER_SPECTRUM | power_spectrum | Compute power spectrum of a complex vector. |
+| OVERRIDE_MDF_POWER_SPECTRUM_ACCUM | power_spectrum_accum | Same as power_spectrum above, with extra accumulation. |
+| OVERRIDE_MDF_PREEMPH_FLT | mdf_preemph | Copy spx_word16_(int16_t) input data to buffer and apply pre-emphasis filter. |
+| OVERRIDE_MDF_SMOOTHED_ADD | smoothed_add | Blend error and echo residual to apply a smooth transition to avoid introducing blocking artifacts. |
+| OVERRIDE_MDF_SMOOTH_FE_NRG | smooth_fe_nrg | Smooth far end energy estimate over time. |
+| OVERRIDE_MDF_SPECTRAL_MUL_ACCUM | spectral_mul_accum | Compute cross-power spectrum of complex vectors and accumulate. Only relevant for fixed-point as it mixes spx_word16_t and spx_word32_t. Floating point version, used in Audiomark, has only plain floating point versions and does not distinguish with spectral_mul_accum16. |
+| OVERRIDE_MDF_SPECTRAL_MUL_ACCUM16 | spectral_mul_accum16 | Same as spectral_mul_accum above but plain spx_word16_t format. Floating point version, used in Audiomark, has only plain floating point versions and does not distinguish with spectral_mul_accum. |
+| OVERRIDE_MDF_STRIDED_PREEMPH_FLT | mdf_preemph_with_stride_int | Strided spx_int16_t (int16_t) pre-emphasis filter with saturation check. |
+| OVERRIDE_MDF_VEC_ADD | vect_add | spx_word16_t (Floating point) vector addition. |
+| OVERRIDE_MDF_VEC_CLEAR | vect_clear | spx_word16_t (Floating point) vector clear. |
+| OVERRIDE_MDF_VEC_COPY | vect_copy | spx_word16_t (Floating point) vector copy. |
+| OVERRIDE_MDF_VEC_MULT | vect_mult | spx_word16_t (Floating point) vector multiplication for windowing. |
+| OVERRIDE_MDF_VEC_SCALE | vect_scale | spx_word16_t (Floating point) vector scaling. |
+| OVERRIDE_MDF_VEC_SUB | vect_sub | spx_word16_t (Floating point) vector subtraction. |
+| OVERRIDE_MDF_VEC_SUB_INT16 | vect_sub16 | spx_int16_t (16-bit signed integer point) vector subtraction for filtered echo computation, difference of AEC input and output subframes. |
+| OVERRIDE_MDF_WEIGHT_SPECT_MUL_CONJ | weighted_spectral_mul_conj | Compute weighted cross-power spectrum of a complex vector with conjugate. |
+| OVERRIDE_ANR_APOSTERIORI_SNR | aposteriori_snr | Compute A-posteriori / A-priori SNRs. |
+| OVERRIDE_ANR_APPLY_SPEC_GAIN | apply_spectral_gain | Apply computed spectral gain. |
+| OVERRIDE_ANR_COMPUTE_GAIN_FLOOR | compute_gain_floor | Compute the gain floor based on different floors for the background noise and residual echo. |
+| OVERRIDE_ANR_HYPERGEOM_GAIN | hypergeom_gain | compute hypergeometric function. |
+| OVERRIDE_ANR_OLA | vect_ola | spx_word16_t vector overlap and add. |
+| OVERRIDE_ANR_POWER_SPECTRUM | power_spectrum | Complex magnitude squared of a spx_word16_t (floating-point) vector. |
+| OVERRIDE_ANR_QCURVE | qcurve | Compute 1 / (1 + 0.15 / (SNR_SCALING_1 * x)) |
+| OVERRIDE_ANR_UPDATE_GAINS_CRITICAL_BANDS | update_gains_critical_bands | Update gains in critical bands (MEL scale). |
+| OVERRIDE_ANR_UPDATE_GAINS_LINEAR | update_gains_linear | Update gains in linear spectral bands. |
+| OVERRIDE_ANR_UPDATE_NOISE_ESTIMATE | update_noise_estimate | Update noise estimates. |
+| OVERRIDE_ANR_UPDATE_NOISE_PROB | update_noise_prob | Update noise probabilities and smoothed power spectrum. |
+| OVERRIDE_ANR_UPDATE_ZETA | preprocess_update_zeta | Update Smoothed a priori SNR. |
+| OVERRIDE_ANR_VEC_CONV_FROM_INT16 | vect_conv_from_int16 | Convert spx_int16_t (16-bit signed integer) vector to spx_word16_t (floating-point). |
+| OVERRIDE_ANR_VEC_COPY | vect_copy* | Generic vector copy. |
+| OVERRIDE_ANR_VEC_MUL | vect_mult* | `spx_word16_t` vector multiplication for windowing. |
+| OVERRIDE_FB_COMPUTE_BANK32 | filterbank_compute_bank32 | Convert linear power spectrum in MEL perceptual scale. |
+| OVERRIDE_FB_COMPUTE_PSD16 | filterbank_compute_psd16 | Compute the linear power spectral density from MEL perceptual scale. |
+
+\* It can be noted that ANR vector multiplications and vector copy routines are similar to AEC ones and can be shared.
+
+# Memory model
+
+There are four types of memory required for the benchmark: input audio buffers,
+pre-defined inter-component buffers, constant tables, and component-specific
+scratch memory requests.
+
+## Input audio buffers
+
+Three channels of input audio are provided: left, right, and noise. There are
+roughly 1.69 seconds of audio in these buffers. A fourth buffer, `for_asr`, is
+used to propagate the AEC output.
+
+## Inter-component buffers
+
+Each component connects to the other components or inputs via one or more
+buffers. These statically allocated buffers' storage is defined in `th_api.c`
+to allow the system integrator to better control placement.
+
+## Table constants
+
+All files with the name `*_tables.c` define arrays that are referenced via
+extern from their respective components. These array variables have been
+stored in their own source files to facilitate linker placement optimization.
+
+The adaptive beamformer, MFCC feature extractor, and neural net all have
+several large tables of constants.
+
+## Dynamic allocation for scratch memory
+
+Each component needs a certain amount of scratch memory. The components are
+written in such a way that they are first queried to indicate how much
+memory they require, and then that is allocated by the framework and provided
+via buffers during reset and run. This has been abstracted down to the 
+`th_malloc` function. The parameters to this function are the number of
+required bytes and the component that is requesting it. The system integrator
+can use the default STDLIB `malloc` function, or allocate their own memory
+buffers to target different types of memory. **The memory is never freed so
+there is no need to install a sophisticated memory manager.** Simply assigning
+subsequent address pointers is sufficient (provided there is enough memory).
+
+Both the LibSpeexDSP and EEMBC-provided components utilize this dynamic
+allocation pattern.
+
+# Coding conventions
+
+## Formatting
+
+EEMBC formats according to [Barr-C Embedded Standards](https://barrgroup.com/sites/default/files/barr_c_coding_standard_2018.pdf). The `.clang-format` file
+in the root directory observes this. This file can be used within VSCode or
+MSVC, however it isn't clear if this behaves the same as `clang-format`
+version 14 (which aligns pointer stars differently).
+
+## Functions and filenames
+
+Traditionally, all functions and files start with either `ee_` or `th_`, where
+the former notation indicates "thou shall not change" and the latter must be
+changed in order to port the code (i.e., the Test Harness, hence `th_`).
+
+Since there is so much code from Xiph that we are including, we will not change
+all of their code but might have to change some (like FFT wrappers). It may
+require a Run Rule to avoid this code being altered. Ideally every function
+should fall into a simple `th_api` folder or collection of files so that it is
+obvious what needs to be ported. Currently the `components/eembc` folder
+illustrates this by separating all of the Arm-specific code into `th_api.c`.
+
+# Run rules
+
+1. The minimum resolution of the system timer used for performance measurement shall be one (1) kHz.
+
+2. The minimum runtime shall be 10 seconds iterating on the `audiomark_run()` function. At least 10 iterations must complete in this 10 seconds, otherwise the runtime shall be whatever achieves 10 iterations. 10 seconds of runtime with a 1 ms resolution is effectively 0.01% measurement resolution.
+
+3. Only functions and files starting with `th_*` may be altered. In the case of the `th_api`, this is required. The one exception to this is the SpeeX DSP library code, where the user may modify the `wrapfft.c` to install the optimal FFT, change the override macros and provide their own associated functionality, or modify the primitive DSP math macros. All of the EEMBC-provided `ee_` functions ultimately perform DSP computation through a subset of functions declared in the `ee_api.h` header, and implemented however the system integrator chooses (the example puts all of the implementation reference code in the `th_api.c` file).
+
+4. For a score to be considered valid, it must pass the AEC, ANR, ABF, MFCC and KWS regression tests found in the `tests/` directory. The AEC, ANR, MFCC and ABF tests utilize a noise-to-signal ratio check of -50 dB over 62.5 ms frames of data. The KWS test expects the softmax output to match the Top-1 prediction for each inference, and not the actual probability. This allows for flexibility in optimizing the API functions which may not be bit-exact, but still achieve roughly the same fidelity.
+
+5. All processing must be carried out on the platform locally and not sent to the cloud. The definition of a device includes single chip silicon, and multi-die modules, and multi-chip platforms. External memories are allowed.
+
+6. The AudioMark score shall be computed as "iterations per second * 1000 / 1.5". AudioMark/MHz is simply AudioMark divided by the highest core frequency in MHz. For example, if the benchmark is running on Core A at 100 MHz and Core A uses a DSP peripheral running at 300 MHz, the highest frequency is used in the computation; in this case, 300 MHz. See footnote #1 below.
+
+
+7. The benchmark score shall be obtained with the serial computation of the following components--ABF, AEC, ANR, KWS--with execution proceeding from one component to another, on completion. The benchmark shall not be altered or implemented in any way that causes any of the components to execute in parallel.
+
+8. Running multiple processes of AudioMark on a platform is allowed. For example, a dual core product could run one instance of AudioMark (`time_audiomark_run()`) on each core. For N cores running the AudioMark concurrently:
+
+```
+    Define: AM(n) is the AudioMark score defined in Rule #6, measured on core n
+    Define: f(n) is the clock frequency of core n
+    AudioMark (AM) = Sum {AM(n): n = 1 to N}
+    AudioMark/MHz (AM/MHz) = AM divided by max {f(n): n = 1 to N}
+```
+
+**Footnote #1: Explanation of scoring equation**
+
+First, the 1000 factor is introduced to scale the score into a preferred integer range; this is a common EEMBC technique to avoid comparing small fractional numbers. Second, a scaling ratio is added, which was originally intended to compensate for the ratio between the sampling rate and the number of samples being processed in each iteration. The benchmark assumes a pipeline operating on 16 kHz audio input, and the idea is that a platform that is operating efficiently would score 1.0 (with no scalar): it is running exactly at the speed needed, no faster, no slower. A platform with half the performance would measure 0.5 iterations per second. However, the benchmark input dataset is actually 24000 samples worth of data, not 16000, so if one iteration completes in one second, the benchmark has in reality performed better than the 16 kHz design goal. To adjust for this, the score should have been multiplied by 1.5. (Note: The benchmark could reduce the number of samples to one second (or 16k samples); however, the lead-in silence is needed to stabilize the ANR, and the keyword utterance spills over the one-second mark of the sample.) However, due to an error, a divisor of 1.5 was used in the released code. To avoid confusion for people that are already using AudioMark in their projects, we have decided to keep the current code.
+
+# Submitting scores
+
+1. A score must be submitted to the website before it can be used in any external publication such as: academic journals, technical papers, and marketing assets. An unsubmitted score may only be discussed internally or with a 3rd party under NDA (Non-Disclosure Agreement), but not presented to the public in any way. Note: the score may be submitted but not go live (i.e., visible on the web) until a certain date agreed to between the submitter and EEMBC. This is to account for product launch schedules.
+
+2. A submitted score shall be from hardware that is available for purchase to any member of the general public. Scores for hardware that is yet to be announced must be marked "preliminary". This score may be superseded by a new score on the released product, or cleared by request to EEMBC after the product is launched.
+
+3. A score collected from simulation cannot be submitted. The measured score must come from actual silicon. This includes CPU, MCU, MPU, SoC, and FPGA prototypes.
+
+# Credits
+
+This benchmark would not have been possible without the commitment and contributions of the working group members, and the assistance from various domain experts, including (sorted by given name):
+
+* Ashutosh Pandey, Infineon (Technical Lead)
+* Dmitry Utyansky, Synopsys
+* Fabien Klein, Arm
+* Felix Johnny Thomasmathibalan, Arm
+* Gary Jacobson, Renesas
+* Jim Ryan, onsemi
+* Joseph Yiu, Arm
+* Kaiping Lee, Infineon
+* Laurent Le Faucheur, Arm
+* Mark Wallis, STMicroelectronics
+* Nagendra (GD) Gulur Dwarakanath, Texas Instruments
+* Peter Torelli, EEMBC
+* Rita Chattopadhyay, Intel
+* Ruud Derwig, Synopsys
+
+# Copyright and license
+
+Copyright (C) EEMBC, All rights reserved. Please refer to LICENSE.md.

From ce7a2c7e47368f4bba2921f44711640a9eacd5b4 Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Tue, 11 Jun 2024 20:56:42 +0100
Subject: [PATCH 3/7] Remove most of the float64 operations

---
 lib/speexdsp/include/speex/speex_echo.h       |  2 +-
 lib/speexdsp/libspeexdsp/arch.h               |  3 +-
 lib/speexdsp/libspeexdsp/filterbank.c         |  4 +-
 lib/speexdsp/libspeexdsp/fixed_debug.h        |  4 +-
 lib/speexdsp/libspeexdsp/fixed_generic.h      |  5 +-
 lib/speexdsp/libspeexdsp/math_approx.h        | 10 ++--
 lib/speexdsp/libspeexdsp/mdf.c                | 60 +++++++++----------
 lib/speexdsp/libspeexdsp/mdf_opt_generic.c    | 18 +++---
 lib/speexdsp/libspeexdsp/mdf_opt_helium.c     | 12 ++--
 lib/speexdsp/libspeexdsp/preprocess.c         | 36 +++++------
 .../libspeexdsp/preprocess_opt_generic.c      | 16 ++---
 lib/speexdsp/libspeexdsp/resample.c           |  2 +-
 lib/speexdsp/libspeexdsp/scal.c               |  2 +-
 main.c                                        |  6 +-
 src/ee_mfcc_f32.h                             |  4 +-
 15 files changed, 93 insertions(+), 91 deletions(-)

diff --git a/lib/speexdsp/include/speex/speex_echo.h b/lib/speexdsp/include/speex/speex_echo.h
index 716b306..3ee0d3a 100644
--- a/lib/speexdsp/include/speex/speex_echo.h
+++ b/lib/speexdsp/include/speex/speex_echo.h
@@ -45,7 +45,7 @@ extern "C" {
 #endif
 
 #ifndef M_PI
-#define M_PI 3.14159265358979323846
+#define M_PI 3.14159265358979323846f
 #endif
 
 #ifdef FIXED_POINT
diff --git a/lib/speexdsp/libspeexdsp/arch.h b/lib/speexdsp/libspeexdsp/arch.h
index 1cac3d9..39ab743 100644
--- a/lib/speexdsp/libspeexdsp/arch.h
+++ b/lib/speexdsp/libspeexdsp/arch.h
@@ -149,6 +149,7 @@ typedef float spx_word32_t;
 #define VERY_LARGE16 1e15f
 #define Q15_ONE ((spx_word16_t)1.f)
 
+#define Q0CONST(x) ((float)(x))
 #define QCONST16(x,bits) (x)
 #define QCONST32(x,bits) (x)
 
@@ -203,7 +204,7 @@ typedef float spx_word32_t;
 #define PDIV32(a,b)     (((spx_word32_t)(a))/(spx_word32_t)(b))
 
 #define WORD2INT(x) ((x) < -32767.5f ? -32768 : \
-                    ((x) > 32766.5f ? 32767 : (spx_int16_t)floor(.5 + (x))))
+                    ((x) > 32766.5f ? 32767 : (spx_int16_t)floorf(.5f + (x))))
 #endif
 
 
diff --git a/lib/speexdsp/libspeexdsp/filterbank.c b/lib/speexdsp/libspeexdsp/filterbank.c
index afb277d..4114aaf 100644
--- a/lib/speexdsp/libspeexdsp/filterbank.c
+++ b/lib/speexdsp/libspeexdsp/filterbank.c
@@ -46,10 +46,10 @@
 #define toBARK(n)   (MULT16_16(26829,spx_atan(SHR32(MULT16_16(97,n),2))) + MULT16_16(4588,spx_atan(MULT16_32_Q15(20,MULT16_16(n,n)))) + MULT16_16(3355,n))
 
 #else
-#define toBARK(n)   (13.1f*atan(.00074f*(n))+2.24f*atan((n)*(n)*1.85e-8f)+1e-4f*(n))
+#define toBARK(n)   (13.1f*atanf(.00074f*(n))+2.24f*atanf((n)*(n)*1.85e-8f)+1e-4f*(n))
 #endif
 
-#define toMEL(n)    (2595.f*log10(1.f+(n)/700.f))
+#define toMEL(n)    (2595.f*log10f(1.f+(n)/700.f))
 
 /* Optimized filter bank routines */
 #include "filterbank_opt.c"
diff --git a/lib/speexdsp/libspeexdsp/fixed_debug.h b/lib/speexdsp/libspeexdsp/fixed_debug.h
index dbf02f1..f7f9ee8 100644
--- a/lib/speexdsp/libspeexdsp/fixed_debug.h
+++ b/lib/speexdsp/libspeexdsp/fixed_debug.h
@@ -40,8 +40,10 @@
 extern long long spx_mips;
 #define MIPS_INC spx_mips++,
 
-#define QCONST16(x,bits) ((spx_word16_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
-#define QCONST32(x,bits) ((spx_word32_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
+/* Q0 constant: passed through unscaled, mirroring the definition in fixed_generic.h */
+#define Q0CONST(x)       (x)
+#define QCONST16(x,bits) ((spx_word16_t)(.5f+(x)*(((spx_word32_t)1)<<(bits))))
+#define QCONST32(x,bits) ((spx_word32_t)(.5f+(x)*(((spx_word32_t)1)<<(bits))))
 
 
 #define VERIFY_SHORT(x) ((x)<=32767&&(x)>=-32768)
diff --git a/lib/speexdsp/libspeexdsp/fixed_generic.h b/lib/speexdsp/libspeexdsp/fixed_generic.h
index 09366c3..0a77fdd 100644
--- a/lib/speexdsp/libspeexdsp/fixed_generic.h
+++ b/lib/speexdsp/libspeexdsp/fixed_generic.h
@@ -35,8 +35,9 @@
 #ifndef FIXED_GENERIC_H
 #define FIXED_GENERIC_H
 
-#define QCONST16(x,bits) ((spx_word16_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
-#define QCONST32(x,bits) ((spx_word32_t)(.5+(x)*(((spx_word32_t)1)<<(bits))))
+#define Q0CONST(x)       (x)
+#define QCONST16(x,bits) ((spx_word16_t)(.5f+(x)*(((spx_word32_t)1)<<(bits))))
+#define QCONST32(x,bits) ((spx_word32_t)(.5f+(x)*(((spx_word32_t)1)<<(bits))))
 
 #define NEG16(x) (-(x))
 #define NEG32(x) (-(x))
diff --git a/lib/speexdsp/libspeexdsp/math_approx.h b/lib/speexdsp/libspeexdsp/math_approx.h
index 596dfdc..5fc760a 100644
--- a/lib/speexdsp/libspeexdsp/math_approx.h
+++ b/lib/speexdsp/libspeexdsp/math_approx.h
@@ -39,11 +39,11 @@
 
 #ifndef FIXED_POINT
 
-#define spx_sqrt sqrt
-#define spx_acos acos
-#define spx_exp exp
-#define spx_cos_norm(x) (cos((.5f*M_PI)*(x)))
-#define spx_atan atan
+#define spx_sqrt sqrtf
+#define spx_acos acosf
+#define spx_exp expf
+#define spx_cos_norm(x) (cosf((.5f*M_PI)*(x)))
+#define spx_atan atanf
 
 /** Generate a pseudo-random number */
 static inline spx_word16_t speex_rand(spx_word16_t std, spx_int32_t *seed)
diff --git a/lib/speexdsp/libspeexdsp/mdf.c b/lib/speexdsp/libspeexdsp/mdf.c
index feab962..37397b1 100644
--- a/lib/speexdsp/libspeexdsp/mdf.c
+++ b/lib/speexdsp/libspeexdsp/mdf.c
@@ -181,9 +181,9 @@ static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius,
    int i;
    spx_word16_t den2;
 #ifdef FIXED_POINT
-   den2 = MULT16_16_Q15(radius,radius) + MULT16_16_Q15(QCONST16(.7,15),MULT16_16_Q15(32767-radius,32767-radius));
+   den2 = MULT16_16_Q15(radius,radius) + MULT16_16_Q15(QCONST16(.7f,15),MULT16_16_Q15(32767-radius,32767-radius));
 #else
-   den2 = radius*radius + .7*(1-radius)*(1-radius);
+   den2 = radius*radius + .7f*(1.0f-radius)*(1.0f-radius);
 #endif
    /*printf ("%d %d %d %d %d %d\n", num[0], num[1], num[2], den[0], den[1], den[2]);*/
    for (i=0;i<len;i++)
@@ -193,7 +193,7 @@ static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius,
 #ifdef FIXED_POINT
       mem[0] = mem[1] + SHL32(SHL32(-EXTEND32(vin),15) + MULT16_32_Q15(radius,vout),1);
 #else
-      mem[0] = mem[1] + 2*(-vin + radius*vout);
+      mem[0] = mem[1] + 2.0f*(-vin + radius*vout);
 #endif
       mem[1] = SHL32(EXTEND32(vin),15) - MULT16_32_Q15(den2,vout);
       out[i] = SATURATE32(PSHR32(MULT16_32_Q15(radius,vout),15),32767);
@@ -485,7 +485,7 @@ EXPORT SpeexEchoState *speex_echo_state_init_mc(int frame_size, int filter_lengt
    }
 #else
    for (i=0;i<N;i++)
-      st->window[i] = .5-.5*cos(2*M_PI*i/N);
+      st->window[i] = .5f-.5f*cosf(2*M_PI*i/N);
 #endif
    for (i=0;i<=st->frame_size;i++)
       st->power_1[i] = FLOAT_ONE;
@@ -494,8 +494,8 @@ EXPORT SpeexEchoState *speex_echo_state_init_mc(int frame_size, int filter_lengt
    {
       spx_word32_t sum = 0;
       /* Ratio of ~10 between adaptation rate of first and last block */
-      spx_word16_t decay = SHR32(spx_exp(NEG16(DIV32_16(QCONST16(2.4,11),M))),1);
-      st->prop[0] = QCONST16(.7, 15);
+      spx_word16_t decay = SHR32(spx_exp(NEG16(DIV32_16(QCONST16(2.4f,11),M))),1);
+      st->prop[0] = QCONST16(.7f, 15);
       sum = EXTEND32(st->prop[0]);
       for (i=1;i<M;i++)
       {
@@ -511,13 +511,13 @@ EXPORT SpeexEchoState *speex_echo_state_init_mc(int frame_size, int filter_lengt
    st->memX = (spx_word16_t*)speex_alloc(K*sizeof(spx_word16_t));
    st->memD = (spx_word16_t*)speex_alloc(C*sizeof(spx_word16_t));
    st->memE = (spx_word16_t*)speex_alloc(C*sizeof(spx_word16_t));
-   st->preemph = QCONST16(.9,15);
+   st->preemph = QCONST16(.9f,15);
    if (st->sampling_rate<12000)
-      st->notch_radius = QCONST16(.9, 15);
+      st->notch_radius = QCONST16(.9f, 15);
    else if (st->sampling_rate<24000)
-      st->notch_radius = QCONST16(.982, 15);
+      st->notch_radius = QCONST16(.982f, 15);
    else
-      st->notch_radius = QCONST16(.992, 15);
+      st->notch_radius = QCONST16(.992f, 15);
 
    st->notch_mem = (spx_mem_t*)speex_alloc(2*C*sizeof(spx_mem_t));
    st->adapted = 0;
@@ -724,8 +724,8 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
    ss=DIV32_16(11469,M);
    ss_1 = SUB16(32767,ss);
 #else
-   ss=.35/M;
-   ss_1 = 1-ss;
+   ss=.35f/M;
+   ss_1 = 1.0f-ss;
 #endif
 
    for (chan = 0; chan < C; chan++)
@@ -1103,7 +1103,7 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
    /* Do some sanity check */
    if (!(Syy>=0 && Sxx>=0 && See >= 0)
 #ifndef FIXED_POINT
-       || !(Sff < N*1e9 && Syy < N*1e9 && Sxx < N*1e9)
+       || !(Sff < N*1e9f && Syy < N*1e9f && Sxx < N*1e9f)
 #endif
       )
    {
@@ -1142,7 +1142,7 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
 #ifndef OVERRIDE_MDF_SMOOTH_FE_NRG
    /* Smooth far end energy estimate over time */
    for (j=0;j<=st->frame_size;j++)
-      st->power[j] = MULT16_32_Q15(ss_1,st->power[j]) + 1 + MULT16_32_Q15(ss,st->Xf[j]);
+      st->power[j] = MULT16_32_Q15(ss_1,st->power[j]) + Q0CONST(1) + MULT16_32_Q15(ss,st->Xf[j]);
 #else
    smooth_fe_nrg(st->power, ss_1, st->Xf, ss, st->power, st->frame_size + 1);
 #endif
@@ -1211,12 +1211,12 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
       tmp32 = SHR32(See,1);
    RER = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32,See),15));
 #else
-   RER = (.0001*Sxx + 3.*MULT16_32_Q15(st->leak_estimate,Syy)) / See;
+   RER = (.0001f*Sxx + 3.0f*MULT16_32_Q15(st->leak_estimate,Syy)) / See;
    /* Check for y in e (lower bound on RER) */
-   if (RER < Sey*Sey/(1+See*Syy))
-      RER = Sey*Sey/(1+See*Syy);
-   if (RER > .5)
-      RER = .5;
+   if (RER < Sey*Sey/(1.0f+See*Syy))
+      RER = Sey*Sey/(1.0f+See*Syy);
+   if (RER > .5f)
+      RER = .5f;
 #endif
 
    /* We consider that the filter has had minimal adaptation if the following is true*/
@@ -1239,12 +1239,12 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
          if (r>SHR32(e,1))
             r = SHR32(e,1);
 #else
-         if (r>.5*e)
-            r = .5*e;
+         if (r>.5f*e)
+            r = .5f*e;
 #endif
-         r = MULT16_32_Q15(QCONST16(.7,15),r) + MULT16_32_Q15(QCONST16(.3,15),(spx_word32_t)(MULT16_32_Q15(RER,e)));
+         r = MULT16_32_Q15(QCONST16(.7f,15),r) + MULT16_32_Q15(QCONST16(.3f,15),(spx_word32_t)(MULT16_32_Q15(RER,e)));
          /*st->power_1[i] = adapt_rate*r/(e*(1+st->power[i]));*/
-         st->power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r,FLOAT_MUL32U(e,st->power[i]+10)),WEIGHT_SHIFT+16);
+         st->power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r,FLOAT_MUL32U(e,st->power[i]+Q0CONST(10))),WEIGHT_SHIFT+16);
       }
 #else
       mdf_nominal_learning_rate_calc(st->Rf, st->power, st->Yf, st->power_1, st->leak_estimate, RER, st->frame_size + 1);
@@ -1260,14 +1260,14 @@ EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, c
          if (tmp32 > SHR32(See,2))
             tmp32 = SHR32(See,2);
 #else
-         if (tmp32 > .25*See)
-            tmp32 = .25*See;
+         if (tmp32 > .25f*See)
+            tmp32 = .25f*See;
 #endif
          adapt_rate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32, See),15));
       }
 #ifndef OVERRIDE_MDF_CONVERG_LEARN_RATE_CALC
       for (i=0;i<=st->frame_size;i++)
-         st->power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(st->power[i],10)),WEIGHT_SHIFT+1);
+         st->power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(st->power[i],Q0CONST(10))),WEIGHT_SHIFT+1);
 #else
        mdf_non_adapt_learning_rate_calc(st->power, st->power_1, adapt_rate, st->frame_size + 1);
 #endif
@@ -1328,7 +1328,7 @@ void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *residual_echo, in
    else
       leak2 = SHL16(st->leak_estimate, 1);
 #else
-   if (st->leak_estimate>.5)
+   if (st->leak_estimate>.5f)
       leak2 = 1;
    else
       leak2 = 2*st->leak_estimate;
@@ -1362,11 +1362,11 @@ EXPORT int speex_echo_ctl(SpeexEchoState *st, int request, void *ptr)
          st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
 #endif
          if (st->sampling_rate<12000)
-            st->notch_radius = QCONST16(.9, 15);
+            st->notch_radius = QCONST16(.9f, 15);
          else if (st->sampling_rate<24000)
-            st->notch_radius = QCONST16(.982, 15);
+            st->notch_radius = QCONST16(.982f, 15);
          else
-            st->notch_radius = QCONST16(.992, 15);
+            st->notch_radius = QCONST16(.992f, 15);
          break;
       case SPEEX_ECHO_GET_SAMPLING_RATE:
          (*(int*)ptr) = st->sampling_rate;
diff --git a/lib/speexdsp/libspeexdsp/mdf_opt_generic.c b/lib/speexdsp/libspeexdsp/mdf_opt_generic.c
index a65ed2e..5b1276a 100755
--- a/lib/speexdsp/libspeexdsp/mdf_opt_generic.c
+++ b/lib/speexdsp/libspeexdsp/mdf_opt_generic.c
@@ -45,9 +45,9 @@ static void filter_dc_notch16(const spx_int16_t * in, spx_word16_t radius, spx_w
     int             i;
     spx_word16_t    den2;
 #ifdef FIXED_POINT
-    den2 = MULT16_16_Q15(radius, radius) + MULT16_16_Q15(QCONST16(.7, 15), MULT16_16_Q15(32767 - radius, 32767 - radius));
+    den2 = MULT16_16_Q15(radius, radius) + MULT16_16_Q15(QCONST16(.7f, 15), MULT16_16_Q15(32767 - radius, 32767 - radius));
 #else
-    den2 = radius * radius + .7 * (1 - radius) * (1 - radius);
+    den2 = radius * radius + .7f * (1.0f - radius) * (1.0f - radius);
 #endif
     /*printf ("%d %d %d %d %d %d\n", num[0], num[1], num[2], den[0], den[1], den[2]); */
     for (i = 0; i < len; i++) {
@@ -56,7 +56,7 @@ static void filter_dc_notch16(const spx_int16_t * in, spx_word16_t radius, spx_w
 #ifdef FIXED_POINT
         mem[0] = mem[1] + SHL32(SHL32(-EXTEND32(vin), 15) + MULT16_32_Q15(radius, vout), 1);
 #else
-        mem[0] = mem[1] + 2 * (-vin + radius * vout);
+        mem[0] = mem[1] + 2.0f * (-vin + radius * vout);
 #endif
         mem[1] = SHL32(EXTEND32(vin), 15) - MULT16_32_Q15(den2, vout);
         out[i] = SATURATE32(PSHR32(MULT16_32_Q15(radius, vout), 15), 32767);
@@ -415,7 +415,7 @@ static void smooth_fe_nrg(spx_word32_t * in1, spx_word16_t c1, spx_word32_t * in
     int             j;
 
     for (j = 0; j <= frame_size; j++)
-        pDst[j] = MULT16_32_Q15(c1, in1[j]) + 1 + MULT16_32_Q15(c2, in2[j]);
+        pDst[j] = MULT16_32_Q15(c1, in1[j]) + Q0CONST(1) + MULT16_32_Q15(c2, in2[j]);
 }
 #endif
 
@@ -458,12 +458,12 @@ static void mdf_nominal_learning_rate_calc(spx_word32_t * pRf, spx_word32_t * po
         if (r > SHR32(e, 1))
             r = SHR32(e, 1);
 #else
-        if (r > .5 * e)
-            r = .5 * e;
+        if (r > .5f * e)
+            r = .5f * e;
 #endif
-        r = MULT16_32_Q15(QCONST16(.7, 15), r) + MULT16_32_Q15(QCONST16(.3, 15), (spx_word32_t) (MULT16_32_Q15(RER, e)));
+        r = MULT16_32_Q15(QCONST16(.7f, 15), r) + MULT16_32_Q15(QCONST16(.3f, 15), (spx_word32_t) (MULT16_32_Q15(RER, e)));
         /*st->power_1[i] = adapt_rate*r/(e*(1+st->power[i])); */
-        power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r, FLOAT_MUL32U(e, power[i] + 10)), WEIGHT_SHIFT + 16);
+        power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r, FLOAT_MUL32U(e, power[i] + Q0CONST(10))), WEIGHT_SHIFT + 16);
     }
 }
 
@@ -475,7 +475,7 @@ static void mdf_non_adapt_learning_rate_calc(spx_word32_t * power, spx_float_t *
     int             i;
 
     for (i = 0; i < frame_size; i++)
-        power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate), ADD32(power[i], 10)), WEIGHT_SHIFT + 1);
+        power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate), ADD32(power[i], Q0CONST(10))), WEIGHT_SHIFT + 1);
 }
 
 #endif
diff --git a/lib/speexdsp/libspeexdsp/mdf_opt_helium.c b/lib/speexdsp/libspeexdsp/mdf_opt_helium.c
index ef0891d..0f74810 100755
--- a/lib/speexdsp/libspeexdsp/mdf_opt_helium.c
+++ b/lib/speexdsp/libspeexdsp/mdf_opt_helium.c
@@ -733,8 +733,8 @@ VISIB_ATTR void mdf_nominal_learning_rate_calc(spx_word32_t * pRf, spx_word32_t
                                          spx_word32_t * pYf, spx_float_t * power_1, spx_word16_t leak_estimate, spx_word16_t RER, uint16_t len)
 {
     int             blockSize = len >> 2;
-    float32_t       cst_0_7 = QCONST16(.7, 15);
-    float32_t       cst_0_3 = QCONST16(.3, 15);
+    float32_t       cst_0_7 = QCONST16(.7f, 15);
+    float32_t       cst_0_3 = QCONST16(.3f, 15);
 
     do {
         float32x4_t     vecYf = vld1q(pYf);
@@ -771,10 +771,10 @@ VISIB_ATTR void mdf_nominal_learning_rate_calc(spx_word32_t * pRf, spx_word32_t
         r = MULT16_32_Q15(leak_estimate, SHL32(*pYf++, 3));
         e = SHL32(*pRf++, 3) + 1;
 
-        if (r > .5 * e)
-            r = .5 * e;
+        if (r > .5f * e)
+            r = .5f * e;
 
-        r = MULT16_32_Q15(QCONST16(.7, 15), r) + MULT16_32_Q15(QCONST16(.3, 15), (spx_word32_t) (MULT16_32_Q15(RER, e)));
+        r = MULT16_32_Q15(QCONST16(.7f, 15), r) + MULT16_32_Q15(QCONST16(.3f, 15), (spx_word32_t) (MULT16_32_Q15(RER, e)));
         /*st->power_1[i] = adapt_rate*r/(e*(1+st->power[i])); */
         *power_1++ = FLOAT_SHL(FLOAT_DIV32_FLOAT(r, FLOAT_MUL32U(e, *power++ + 10)), WEIGHT_SHIFT + 16);
     }
@@ -804,7 +804,7 @@ VISIB_ATTR void mdf_non_adapt_learning_rate_calc(spx_word32_t * power, spx_float
 
     /* tail */
     for (int i = 0; i <= (len & 3); i++) {
-        *power_1++ = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate), ADD32(*power++, 10)), WEIGHT_SHIFT + 1);
+        *power_1++ = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate), ADD32(*power++, Q0CONST(10))), WEIGHT_SHIFT + 1);
     }
 }
 
diff --git a/lib/speexdsp/libspeexdsp/preprocess.c b/lib/speexdsp/libspeexdsp/preprocess.c
index c9f37ca..e14790b 100644
--- a/lib/speexdsp/libspeexdsp/preprocess.c
+++ b/lib/speexdsp/libspeexdsp/preprocess.c
@@ -359,14 +359,14 @@ static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
       1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
       2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
       x = EXPIN_SCALING_1*xx;
-      integer = floor(2*x);
+      integer = floorf(2*x);
       ind = (int)integer;
       if (ind<0)
          return FRAC_SCALING;
       if (ind>19)
-         return FRAC_SCALING*(1+.1296/x);
-      frac = 2*x-integer;
-      return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
+         return FRAC_SCALING*(1.0f+.1296f/x);
+      frac = 2.0f*x-integer;
+      return FRAC_SCALING*((1.0f-frac)*table[ind] + frac*table[ind+1])/sqrtf(x+.0001f);
 }
 #endif
 
@@ -384,12 +384,12 @@ static void compute_gain_floor(int noise_suppress, int effective_echo_suppress,
    float echo_floor;
    float noise_floor;
 
-   noise_floor = exp(.2302585f*noise_suppress);
-   echo_floor = exp(.2302585f*effective_echo_suppress);
+   noise_floor = expf(.2302585f*noise_suppress);
+   echo_floor = expf(.2302585f*effective_echo_suppress);
 
    /* Compute the gain floor based on different floors for the background noise and residual echo */
    for (i=0;i<len;i++)
-      gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
+      gain_floor[i] = FRAC_SCALING*sqrtf(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrtf(1.0f+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
 }
 #endif
 
@@ -586,27 +586,27 @@ static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx
    {
       loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
    }
-   loudness=sqrt(loudness);
+   loudness=sqrtf(loudness);
       /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
    loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
    if (Pframe>.3f)
    {
       /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
-      rate = .03*Pframe*Pframe;
-      st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
-      st->loudness_accum = (1-rate)*st->loudness_accum + rate;
+      rate = .03f*Pframe*Pframe;
+      st->loudness = (1.0f-rate)*st->loudness + (rate)*powf(AMP_SCALE*loudness, LOUDNESS_EXP);
+      st->loudness_accum = (1.0f-rate)*st->loudness_accum + rate;
       if (st->init_max < st->max_gain && st->nb_adapt > 20)
          st->init_max *= 1.f + .1f*Pframe*Pframe;
    }
    /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
 
-   target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
+   target_gain = AMP_SCALE*st->agc_level*powf(st->loudness/(1e-4f+st->loudness_accum), -1.0f/LOUDNESS_EXP);
 
-   if ((Pframe>.5  && st->nb_adapt > 20) || target_gain < st->agc_gain)
+   if ((Pframe>.5f  && st->nb_adapt > 20) || target_gain < st->agc_gain)
    {
       if (target_gain > st->max_increase_step*st->agc_gain)
          target_gain = st->max_increase_step*st->agc_gain;
-      if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
+      if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10.f*st->prev_loudness)
          target_gain = st->max_decrease_step*st->agc_gain;
       if (target_gain > st->max_gain)
          target_gain = st->max_gain;
@@ -774,7 +774,7 @@ EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
       st->nb_adapt = 20000;
    st->min_count++;
 
-   beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
+   beta = MAX16(QCONST16(.03f,15),DIV32_16(Q15_ONE,st->nb_adapt));
    beta_1 = Q15_ONE-beta;
    M = st->nbands;
    /* Deal with residual echo if provided */
@@ -911,7 +911,7 @@ EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
 /*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
       st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
 #else
-      st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
+      st->gain2[i]=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*expf(-theta));
 #endif
    }
 
@@ -1025,8 +1025,8 @@ EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
    {
       float max_sample=0;
       for (i=0;i<2*N;i++)
-         if (fabs(st->frame[i])>max_sample)
-            max_sample = fabs(st->frame[i]);
+         if (fabsf(st->frame[i])>max_sample)
+            max_sample = fabsf(st->frame[i]);
       if (max_sample>28000.f)
       {
          float damp = 28000.f/max_sample;
diff --git a/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c b/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c
index 9b91b2f..07fe667 100755
--- a/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c
+++ b/lib/speexdsp/libspeexdsp/preprocess_opt_generic.c
@@ -88,13 +88,13 @@ static void compute_gain_floor(int noise_suppress, int effective_echo_suppress,
     float           echo_floor;
     float           noise_floor;
 
-    noise_floor = exp(.2302585f * noise_suppress);
-    echo_floor = exp(.2302585f * effective_echo_suppress);
+    noise_floor = expf(.2302585f * noise_suppress);
+    echo_floor = expf(.2302585f * effective_echo_suppress);
 
     /* Compute the gain floor based on different floors for the background noise and residual echo */
     for (i = 0; i < len; i++)
         gain_floor[i] =
-            FRAC_SCALING * sqrt(noise_floor * PSHR32(noise[i], NOISE_SHIFT) + echo_floor * echo[i]) / sqrt(1 + PSHR32(noise[i], NOISE_SHIFT) + echo[i]);
+            FRAC_SCALING * sqrtf(noise_floor * PSHR32(noise[i], NOISE_SHIFT) + echo_floor * echo[i]) / sqrtf(1.0f + PSHR32(noise[i], NOISE_SHIFT) + echo[i]);
 }
 
 #endif
@@ -196,14 +196,14 @@ static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
         2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f
     };
     x = EXPIN_SCALING_1 * xx;
-    integer = floor(2 * x);
+    integer = floorf(2 * x);
     ind = (int) integer;
     if (ind < 0)
         return FRAC_SCALING;
     if (ind > 19)
-        return FRAC_SCALING * (1 + .1296 / x);
-    frac = 2 * x - integer;
-    return FRAC_SCALING * ((1 - frac) * table[ind] + frac * table[ind + 1]) / sqrt(x + .0001f);
+        return FRAC_SCALING * (1.0f + .1296f / x);
+    frac = 2.0f * x - integer;
+    return FRAC_SCALING * ((1.0f - frac) * table[ind] + frac * table[ind + 1]) / sqrtf(x + .0001f);
 }
 #endif
 
@@ -257,7 +257,7 @@ static void update_gains_critical_bands(SpeexPreprocessState * st, spx_word16_t
             EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q), 8), (Q15_ONE - q)), tmp), 8));
         st->gain2[i] = DIV32_16(SHL32(EXTEND32(32767), SNR_SHIFT), ADD16(256, tmp));
 #else
-        st->gain2[i] = 1 / (1.f + (q / (1.f - q)) * (1 + st->prior[i]) * exp(-theta));
+        st->gain2[i] = 1.0f / (1.f + (q / (1.f - q)) * (1.0f + st->prior[i]) * expf(-theta));
 #endif
     }
 }
diff --git a/lib/speexdsp/libspeexdsp/resample.c b/lib/speexdsp/libspeexdsp/resample.c
index ecab865..6b36efd 100644
--- a/lib/speexdsp/libspeexdsp/resample.c
+++ b/lib/speexdsp/libspeexdsp/resample.c
@@ -82,7 +82,7 @@ static void speex_free(void *ptr) {free(ptr);}
 #include <limits.h>
 
 #ifndef M_PI
-#define M_PI 3.14159265358979323846
+#define M_PI 3.14159265358979323846f
 #endif
 
 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
diff --git a/lib/speexdsp/libspeexdsp/scal.c b/lib/speexdsp/libspeexdsp/scal.c
index cb04706..2517c5d 100644
--- a/lib/speexdsp/libspeexdsp/scal.c
+++ b/lib/speexdsp/libspeexdsp/scal.c
@@ -53,7 +53,7 @@ The algorithm implemented here is described in:
 #include <stdlib.h>
 
 #ifndef M_PI
-#define M_PI           3.14159265358979323846  /* pi */
+#define M_PI           3.14159265358979323846f  /* pi */
 #endif
 
 #define ALLPASS_ORDER 20
diff --git a/main.c b/main.c
index 5fa76f2..c5465c4 100755
--- a/main.c
+++ b/main.c
@@ -103,7 +103,7 @@ main(void)
         {
             break;
         }
-    } while (dt < 1e6);
+    } while (dt < 1e6f);
 
     if (err)
     {
@@ -112,7 +112,7 @@ main(void)
     }
 
     // Must run for 10 sec. or at least 10 iterations
-    float scale = 11e6 / dt;
+    float scale = 11e6f / dt;
     iterations  = (uint32_t)((float)iterations * scale);
     iterations  = iterations < 10 ? 10 : iterations;
 
@@ -132,7 +132,7 @@ main(void)
      * the pipeline runs. x 1000 to make it a bigger number.
      */
     float sec   = (float)dt / 1.0e6f;
-    float score = (float)iterations / sec * 1000.f * (1 / 1.5f);
+    float score = (float)iterations / sec * 1000.f * (1.0f / 1.5f);
 
     printf("Total runtime    : %.3f seconds\n", sec);
     printf("Total iterations : %d iterations\n", iterations);
diff --git a/src/ee_mfcc_f32.h b/src/ee_mfcc_f32.h
index 04bc038..69b9525 100644
--- a/src/ee_mfcc_f32.h
+++ b/src/ee_mfcc_f32.h
@@ -41,8 +41,8 @@ extern const ee_f32_t ee_mfcc_filter_coefs_f32[EE_NUM_MFCC_FILTER_COEFS];
  * two values come from the trained model:
  * https://github.com/ARM-software/ML-zoo/tree/master/models/keyword_spotting/ds_cnn_small/tflite_int8
  */
-#define MFCC_SCALE        (1.0f / 1.084193468093872)
-#define MFCC_OFFSET       ((float)100)
+#define MFCC_SCALE        (1.0f / 1.084193468093872f)
+#define MFCC_OFFSET       (100.f)
 #define NUM_MFCC_FEATURES 10
 // frame_len_padded = pow(2,ceil((log(FRAME_LEN)/log(2)))); == 1024
 #define PADDED_FRAME_LEN 1024

From d53b9e829327bb111eb0158a92ff44c4ee9bf7a7 Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Tue, 11 Jun 2024 21:00:57 +0100
Subject: [PATCH 4/7] Remove unused SNR ratios

---
 tests/test_kws.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/tests/test_kws.c b/tests/test_kws.c
index b586f45..1278cdb 100644
--- a/tests/test_kws.c
+++ b/tests/test_kws.c
@@ -23,12 +23,8 @@
 #define NCLASSES 12
 
 /* Noise to signal ratio */
-#define NSRM50DB 0.003162f
-#define NSRM40DB 0.01f
 #define NSRM35DB 0.017783f
-#define NSRM30DB 0.03162f
 
-#define USE_NSRM35DB 1
 //#define DEBUG_EXACT_BITS
 #define MAX(a,b) (((a)>(b))?(a):(b))
 
@@ -133,34 +129,11 @@ main(int argc, char *argv[])
 #endif
             }
             ratio = (float)B / (float)A; /* Noise to signal ratio */
-#ifdef USE_NSRM50DB
-            if (ratio > NSRM50DB)
-            {
-                err = true;
-                printf("KWS FAIL: Inference #%d exceeded -50 dB SNR\n", i);
-            }
-#endif
-#ifdef USE_NSRM40DB
-            if (ratio > NSRM40DB)
-            {
-                err = true;
-                printf("KWS FAIL: Inference #%d exceeded -40 dB SNR\n", i);
-            }
-#endif
-#ifdef USE_NSRM35DB
             if (ratio > NSRM35DB)
             {
                 err = true;
                 printf("KWS FAIL: Inference #%d exceeded -35 dB SNR\n", i);
             }
-#endif
-#ifdef USE_NSRM30DB
-            if (ratio > NSRM30DB)
-            {
-                err = true;
-                printf("KWS FAIL: Inference #%d exceeded -30 dB SNR\n", i);
-            }
-#endif
 
         }
     }

From f220f548b20d626c9605ca2b58084001eb67cbda Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Wed, 12 Jun 2024 10:03:35 +0100
Subject: [PATCH 5/7] Adding revision history

---
 README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/README.md b/README.md
index 894ced8..805b0d5 100755
--- a/README.md
+++ b/README.md
@@ -479,6 +479,23 @@ First, the 1000 factor is introduced to scale the score into a preferred integer
 
 3. A score collected from simulation cannot be submitted. The measured score must come from actual silicon. This includes CPU, MCU, MPU, SoC, and FPGA prototypes.
 
+# Revision history
+
+- v1.0.0 First release (4 Feb 2023)
+- v1.0.1 (6 Feb 2023)
+  - Fixes a potential bug where a multiply may goes out of bound
+- v1.0.2 (6 Sept 2023)
+  - Fix an issue in the KWS NN model. The reference C model was converted from the KWS NN TensorFlow Lite model, and there is a small lost of accuracy in the conversion process. After the code update, the reference C model fully match the original model.
+- v1.0.3 (June 2024)
+  - Fix incorrect use of restrict keyword https://github.com/eembc/audiomark/issues/61
+  - KWS unit test switched from bit exact checking to Noise to Signal ratio checking.
+  - Most of the double precision floating-point operations in SpeexLib library code replaced by single precision.
+  - Changes in unit test to clarify the use of Noise-to-Signal ratio instead of Signal-to-Noise (SNR) ratio
+  - Documentation update: Score calculation formula uses 1/1.5 scaling rather than 1*1.5.
+  - Documentation update: Clarify that only the float version of SpeexDSP is supported.
+  - Documentation update: KWS unit test description (changed to use Noise-to-Signal ratio)
+  - Other minor documentation improvements.
+
 # Credits
 
 This benchmark would not have been possible without the commitment and contributions of the working group members, and the assistance from various domain experts, including (sorted by given name):

From c01d513c78b740c6d4b148fc69e92c66d8c276f1 Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Thu, 13 Jun 2024 14:01:01 +0100
Subject: [PATCH 6/7] Update README.md - highlight cache maintenance info

---
 platform/cmsis/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/platform/cmsis/README.md b/platform/cmsis/README.md
index 24a8f73..1041e3c 100755
--- a/platform/cmsis/README.md
+++ b/platform/cmsis/README.md
@@ -267,3 +267,5 @@ Score            : 607.506470 AudioMarks
  - For Corstone-310, small TCMs prevent Code and Data to fit in these. Internal SRAM are used and benchmarks will run with caches enabled. MPS3 FPGA system clock frequency runs at `25Mhz`
  - For MPS2+ Cortex-M33 IoTKit, default system clock frequency runs at `20Mhz`
  - For MPS2+ Cortex-M4/Cortex-M7, CMSDK default system clock frequency runs at `25Mhz`
+ - For platforms utilizing the Ethos-U NPU and running the AudioMark DS-CNN network, it is assumed that, aside from the storage for Neural Network MFCC input and Soft Decisions output - which may be situated in cacheable memories necessitating cache maintenance operations - the rest of the shared Cortex-M and Ethos-U data are **Read-Only**. Consequently, these do not require cache clean and invalidation operations, which saves processing cycles. However, in the standard Ethos-U handling procedures, this approach may not be recommended. The TensorFlow Lite Micro runtime could organize and manage memory allocation in a different manner, involving write-accesses by both CPU and NPU, necessitating the use of appropriate cache operations to ensure data coherency between the CPU and NPU. For reference, typical cache clean and invalidation routines employed in general scenarios can be found in the ML Evaluation Kit CPU cache handling source code.
+    - https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ml-embedded-evaluation-kit/+/refs/heads/main/source/hal/source/components/npu/ethosu_cpu_cache.c

From c151b310543b86d7675d2dcc299aa0a238a49f35 Mon Sep 17 00:00:00 2001
From: Joseph Yiu <77114984+joseph-yiu@users.noreply.github.com>
Date: Fri, 14 Jun 2024 15:45:48 +0100
Subject: [PATCH 7/7] Fix typos and minor improvements in README.md

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 805b0d5..427dc31 100755
--- a/README.md
+++ b/README.md
@@ -21,11 +21,11 @@ The benchmark API facilitates hardware acceleration for key DSP and NN
 functionality. The file `ee_api.h` describes the functions that the system 
 integrator must implement. The components were derived from several sources:
 
-* The beaformer and direction of arrival algorithms (BF+DOA) were written and tested by Arm and Infineon.
-* The acoustic echo canceller (AEC) and audio noise  reduction (ANR) elements are implemented by the SpeeX libspeexdsp library. These functions utilize the SpeeX API, which is a combination of macros and functions that perform floating-point/fixed math operations, and an FFT wrapper for transformation. In AudioMark, only the single precision floating-point version of the library is used.
+* The beamformer and direction of arrival algorithms (BF+DOA) were written and tested by Arm and Infineon.
+* The acoustic echo canceller (AEC) and audio noise reduction (ANR) elements are implemented by the SpeeX libspeexdsp library. These functions utilize the SpeeX API, which is a combination of macros and functions that perform floating-point/fixed math operations, and an FFT wrapper for transformation. In AudioMark, only the single precision floating-point version of the library is used.
 * The neural net was derived from the [Arm Model Zoo DS CNN KWS](https://github.com/ARM-software/ML-zoo/tree/master/models/keyword_spotting/ds_cnn_small/model_package_tf/model_archive/TFLite/tflite_int8).
 
-This flexibility to utilize whatever hardware is available means the benchmark 
+This DS-CNN model was chosen for its flexibility in utilizing available hardware, which means the benchmark 
 scales across a wide variety of MCUs and SoCs.
 
 When possible, the components are implemented in 32-bit floating point, with the 
@@ -220,7 +220,7 @@ optimization through C with intrinsic or even assembly to reach peak
 performance.
 
 As a first example, the AEC power_spectrum routine, which is essentially 
-computing the squared magnitude of a complex signal, could use the CMSIS DSP 
+computing the squared magnitude of a complex signal, could use the CMSIS-DSP 
 `arm_cmplx_mag_squared_f32` function and for this defining the 
 `OVERRIDE_MDF_POWER_SPECTRUM` would deactivate original definition and use the 
 optimized variant that will be placed in the 
@@ -486,7 +486,7 @@ First, the 1000 factor is introduced to scale the score into a preferred integer
   - Fixes a potential bug where a multiply may goes out of bound
 - v1.0.2 (6 Sept 2023)
   - Fix an issue in the KWS NN model. The reference C model was converted from the KWS NN TensorFlow Lite model, and there is a small lost of accuracy in the conversion process. After the code update, the reference C model fully match the original model.
-- v1.0.3 (June 2024)
+- v1.0.3 (14 June 2024)
   - Fix incorrect use of restrict keyword https://github.com/eembc/audiomark/issues/61
   - KWS unit test switched from bit exact checking to Noise to Signal ratio checking.
   - Most of the double precision floating-point operations in SpeexLib library code replaced by single precision.